根据rstudio

时间:2017-11-13 20:48:08

标签: r nlp n-gram

我正在为一个nlp项目工作。作为训练数据集,我使用圣经。如果您想自己尝试,可以轻松创建随机语料库:

# Generate a random corpus of 50 words, each 1 to 6 lowercase letters long.
rcorpus(
  nwords = 50,
  alphabet = letters,
  minwordlen = 1,
  maxwordlen = 6
)

处理完文本文件后,我用ngram-package将语料库划分为n-gram

library(ngram)
# Corpus() and DirSource() are provided by the tm package, so it has to be
# attached as well; library(ngram) alone does not make them available.
library(tm)

# This is a preprocessed corpus I created earlier; every file in the
# directory is read into the corpus.
bible_corpus <- Corpus(DirSource("C:/Users/XYZ/XYZ/"))

现在我正在使用我之前设置过的函数处理语料库。

# Join the corpus elements with single spaces and pass the resulting text to
# the cleaning helper (Text_To_Clean_Sentences is defined elsewhere).
corpus_sentences <- Text_To_Clean_Sentences(
  paste(bible_corpus, collapse = " ")
)

下一步是创建一个将语料库拆分成 n-gram 的函数:
# Collect the n-grams of a given size from a vector of sentences.
#
# Args:
#   sentence_splits: character vector with one sentence per element.
#   ngram_size: number of words per n-gram (default 2).
#
# Returns:
#   A character vector holding the n-grams of every sentence that has at
#   least `ngram_size` words, or NULL when no sentence qualifies.
Get_Ngrams <- function(sentence_splits, ngram_size = 2) {
  per_sentence <- lapply(sentence_splits, function(sentence) {
    # Base trimws() replaces Trim(), which no attached package provides.
    sentence <- trimws(sentence)
    if (is.na(sentence) || !nzchar(sentence)) {
      return(NULL)
    }
    # ngram() needs at least `ngram_size` space-separated words. Count the
    # words themselves: counting "\\W+" separator runs is off by one (a
    # sentence with exactly `ngram_size` words has one separator fewer) and
    # is inflated by punctuation.
    n_words <- length(strsplit(sentence, " +")[[1]])
    if (n_words < ngram_size) {
      return(NULL)
    }
    get.ngrams(ngram(sentence, n = ngram_size))
  })
  # Flatten in one step instead of growing a vector inside a loop.
  unlist(per_sentence, use.names = FALSE)
}

# Build the n-gram sets for sizes 2 through 5 from the cleaned sentences.
n2 <- Get_Ngrams(corpus_sentences, ngram_size = 2)
n3 <- Get_Ngrams(corpus_sentences, ngram_size = 3)
n4 <- Get_Ngrams(corpus_sentences, ngram_size = 4)
n5 <- Get_Ngrams(corpus_sentences, ngram_size = 5)

# Pool every n-gram into one vector, longest n-grams first.
n_all <- c(n5, n4, n3, n2)

现在输入搜索词:

# Search word; the trailing space is part of the pattern.
word <- 'good '

# Keep every n-gram in which `word` occurs at the start of a word ("\\<" is
# the regex start-of-word anchor). grep() is vectorised, so the whole vector
# is filtered in one call instead of growing `matches` one element at a time.
matches <- grep(paste0('\\<', word), n_all, value = TRUE)

# Print each hit on its own line, as the original loop did.
for (sentence in matches) {
  print(sentence)
}

# For each match, count the characters that precede the first occurrence of
# the search word (0 means the n-gram begins with it).
precision_match <- unlist(
  lapply(matches, function(a_match) {
    before_word <- strsplit(x = a_match, split = word)[[1]][[1]]
    nchar(before_word)
  }),
  use.names = FALSE
)

最后一步会返回所有包含我们在第 29 行设置的搜索词的 n-gram。现在我想删除所有不以该搜索词开头的句子。

例如,"precision_match" 返回:

[1] search_word wordX wordY wordZ
[2] search_word wordY wordX wordZ
[3] wordY search_word wordX wordZ
[4] wordY wordX wordZ search_word

当然,我可以手动选择 [1] 和 [2],因为可以看出这两行以 search_word 开头。但当匹配结果很多时,这并不实用。那么,如何提取以 search_word 开头的 n-gram?

1 个答案:

答案 0 :(得分:0)

我运行了您的代码,但不明白您为何需要最后那部分 precision_match——它基本上只给出搜索词距字符串开头的偏移量。不过,您的问题似乎可以在前一步(matches)解决。

尝试

# Fix the RNG seed so the random corpus is reproducible.
set.seed(33)
test <- rcorpus(
  nwords = 50,
  alphabet = letters,
  minwordlen = 1,
  maxwordlen = 6
)

# Collect the n-grams of a given size from a vector of sentences.
#
# Args:
#   sentence_splits: character vector with one sentence per element.
#   ngram_size: number of words per n-gram (default 2).
#
# Returns:
#   A character vector holding the n-grams of every sentence that has at
#   least `ngram_size` words, or NULL when no sentence qualifies.
Get_Ngrams <- function(sentence_splits, ngram_size = 2) {
  per_sentence <- lapply(sentence_splits, function(sentence) {
    sentence <- trimws(sentence)
    if (is.na(sentence) || !nzchar(sentence)) {
      return(NULL)
    }
    # ngram() needs at least `ngram_size` space-separated words. Count the
    # words themselves: counting "\\W+" separator runs is off by one (a
    # sentence with exactly `ngram_size` words has one separator fewer) and
    # is inflated by punctuation.
    n_words <- length(strsplit(sentence, " +")[[1]])
    if (n_words < ngram_size) {
      return(NULL)
    }
    get.ngrams(ngram(sentence, n = ngram_size))
  })
  # Flatten in one step instead of growing a vector inside a loop.
  unlist(per_sentence, use.names = FALSE)
}

# Build bigrams and trigrams from the random corpus and pool them.
ngram2 <- Get_Ngrams(test, 2)
ngram3 <- Get_Ngrams(test, 3)
n_all <- c(ngram2, ngram3)

word <- 'ghpbw'

# Keep every n-gram in which `word` occurs at the start of a word ("\\<" is
# the regex start-of-word anchor). grep() is vectorised, so the whole vector
# is filtered in one call instead of growing `matches` one element at a time.
matches <- grep(paste0('\\<', word), n_all, value = TRUE)

# Print each hit on its own line, as the original loop did.
for (sentence in matches) {
  print(sentence)
}

# Keep only the matches that begin with the search word ("^" anchors the
# pattern to the start of the string).
matches[grepl(pattern = paste0("^", word), matches)]

结果得到以搜索词 ghpbw 开头的 n-gram:[1] "ghpbw zbaiou" "ghpbw zbaiou ffrpj",并丢弃了 [1] "wil ghpbw" "dxjv wil ghpbw" "wil ghpbw zbaiou"。