Question

我正在为一个nlp项目工作。作为训练数据集，我使用圣经。如果您想自己尝试，可以轻松创建随机语料库：

rcorpus(nwords = 50, alphabet = letters, minwordlen = 1, maxwordlen = 6)

处理完文本文件后，我用ngram-package将语料库划分为n-gram

library(ngram)
# This is a preprocessed corpus I created earlier.
# NOTE(review): Corpus() and DirSource() look like functions from the tm
# package; presumably tm has to be loaded as well (library(tm)) - confirm.

bible_corpus<- Corpus(DirSource("C:/Users/XYZ/XYZ/"))

现在我正在使用我之前设置过的函数处理语料库。

corpus_sentences <- Text_To_Clean_Sentences(paste(bible_corpus, collapse=" "))

下一步是创建一个将语料库拆分成 n-gram 的函数。

# Function for getting n-grams.
#
# Builds the word n-grams of every sentence with ngram() / get.ngrams() and
# pools the results of all sentences into a single vector.
#
# Args:
#   sentence_splits: character vector of sentences.
#   ngram_size: the n of the n-grams to build (default 2).
#
# Returns:
#   A vector with the n-grams of all qualifying sentences, or NULL when no
#   sentence qualifies.
Get_Ngrams <- function(sentence_splits, ngram_size=2) {
  per_sentence <- lapply(sentence_splits, function(sentence) {
    # trimws() is base R; Trim() is not a base R function and fails unless
    # some other package happens to provide it.
    sentence <- trimws(sentence)
    # Only non-empty sentences whose number of "\\W+" runs (runs of non-word
    # characters) reaches ngram_size are handed to ngram().
    # NOTE(review): a sentence with exactly ngram_size words has only
    # ngram_size - 1 separators and is therefore skipped - confirm intended.
    if ((nchar(sentence) > 0) &&
        (lengths(gregexpr("\\W+", sentence)) >= ngram_size)) {
      get.ngrams(ngram(sentence, n = ngram_size))
    } else {
      NULL
    }
  })
  # unlist() drops the NULL entries of skipped sentences and returns NULL when
  # nothing is left; collecting once avoids re-copying a growing vector.
  unlist(per_sentence, use.names = FALSE)
}

# Build the n-gram vectors for the sizes 2 to 5 with Get_Ngrams
ngram_sets <- lapply(
  c(n2 = 2, n3 = 3, n4 = 4, n5 = 5),
  function(size) Get_Ngrams(corpus_sentences, ngram_size = size)
)
n2 <- ngram_sets[["n2"]]
n3 <- ngram_sets[["n3"]]
n4 <- ngram_sets[["n4"]]
n5 <- ngram_sets[["n5"]]

# Pool all n-grams, longest first
n_all <- c(n5, n4, n3, n2)

现在输入搜索词：

# Search term (note the trailing space inside the string)
word <- 'good '

# Collect every n-gram in which the search term begins a word;
# "\\<" is the regex start-of-word anchor.
word_start_pattern <- paste0('\\<', word)
hit_flags <- grepl(word_start_pattern, n_all)
matches <- if (any(hit_flags)) n_all[hit_flags] else c()

# Show each hit on its own line
for (hit in matches) {
  print(hit)
}

# For every match, count the characters that come before the first
# occurrence of the search term (0 means the match starts with it).
leading_chars <- function(a_match) {
  pieces <- strsplit(x = a_match, split = word)[[1]]
  nchar(pieces[[1]])
}

precision_match <- c()
if (length(matches) > 0) {
  precision_match <- vapply(matches, leading_chars, integer(1),
                            USE.NAMES = FALSE)
}

上一步返回了所有包含搜索词（原帖代码第 29 行处所指的单词）的 n-gram。现在我想删除所有不以我们输入的搜索词开头的句子。

例如，"precision_match" 返回：

[1] search_word wordX wordY wordZ
[2] search_word wordY wordX wordZ
[3] wordY search_word wordX wordZ
[4] wordY wordX wordZ search_word

当然，我可以手动选择 [1] 和 [2]，因为我能看出这两行以 search_word 开头。但当匹配结果很多时，这样做并不实用。那么，如何提取以 search_word 开头的 n-gram 呢？

Answer 1

我运行了您的代码，但不明白您为什么需要最后那段 precision_match……它基本上只是给出搜索词在字符串中相对于开头的位置偏移。不过，您的问题似乎可以在前一步（matches）中解决。

尝试

# Fix the RNG seed first (presumably so the random corpus is reproducible)
set.seed(33)
test <- rcorpus(
  nwords = 50,
  alphabet = letters,
  minwordlen = 1,
  maxwordlen = 6
)

# Builds the word n-grams of every sentence with ngram() / get.ngrams() and
# pools the results of all sentences into a single vector.
#
# Args:
#   sentence_splits: character vector of sentences.
#   ngram_size: the n of the n-grams to build (default 2).
#
# Returns:
#   A vector with the n-grams of all qualifying sentences, or NULL when no
#   sentence qualifies.
Get_Ngrams <- function(sentence_splits, ngram_size=2) {
  per_sentence <- lapply(sentence_splits, function(sentence) {
    sentence <- trimws(sentence)
    # Only non-empty sentences whose number of "\\W+" runs (runs of non-word
    # characters) reaches ngram_size are handed to ngram().
    if ((nchar(sentence) > 0) &&
        (lengths(gregexpr("\\W+", sentence)) >= ngram_size)) {
      get.ngrams(ngram(sentence, n = ngram_size))
    } else {
      NULL
    }
  })
  # unlist() drops the NULL entries of skipped sentences and returns NULL when
  # nothing is left; collecting once avoids re-copying a growing vector on
  # every iteration.
  unlist(per_sentence, use.names = FALSE)
}

# Build bigrams and trigrams from the random corpus and pool them
ngram2 <- Get_Ngrams(test, 2)
ngram3 <- Get_Ngrams(test, 3)
n_all <- c(ngram2, ngram3)

# Search term
word <- 'ghpbw'

# Keep every n-gram in which the search term begins a word ("\\<" is the
# regex start-of-word anchor). grepl() is vectorised, so no explicit loop is
# needed; as.character() keeps `matches` a character vector even when
# nothing was found.
is_match <- grepl(paste0('\\<', word), n_all)
matches <- as.character(n_all[is_match])
for (hit in matches) {
  print(hit)
}

# Keep only the n-grams that start with the search term. startsWith()
# compares literally, so regex metacharacters in `word` cannot change the
# result (unlike a pattern built with paste0("^", word)).
matches[startsWith(matches, word)]

结果得到以搜索词 ghpbw 开头的 n-gram：[1] "ghpbw zbaiou" "ghpbw zbaiou ffrpj"，而丢弃了 [1] "wil ghpbw" "dxjv wil ghpbw" "wil ghpbw zbaiou"。

根据rstudio

1 个答案: