Finding 2-word phrases with tm in R

Date: 2017-08-23 17:52:05

Tags: r nlp tm corpus word-frequency

I know this has been asked many times before, for example:

Finding 2 & 3 word Phrases Using R TM Package

However, I don't know why none of those solutions work for my data: no matter which n I choose for the ngrams (2, 3, or 4), the result is always unigrams.

Does anyone know the reason? I suspect the encoding is the culprit.

Edit: a small sample of the data.

comments <- c("Merge branch 'master' of git.internal.net:/git/live/LegacyCodebase into problem_70918\n", 
"Merge branch 'master' of git.internal.net:/git/live/LegacyCodebase into tm-247\n", 
"Merge branch 'php5.3-upgrade-sprint6-7' of git.internal.net:/git/pn-project/LegacyCodebase into release2012.08\n", 
"Merge remote-tracking branch 'dmann1/p71148-s3-callplan_mapping' into lcst-operational-changes\n", 
"Merge branch 'master' of git.internal.net:/git/live/LegacyCodebase into TASK-360148\n", 
"Merge remote-tracking branch 'grockett/rpr-pre' into rpr-lite\n"
)
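
Since encoding is one suspect, a quick sanity check on the sample strings may help rule it in or out; a minimal sketch using base R's Encoding() and validUTF8():

Encoding(comments)        # declared encodings; "unknown" usually means plain ASCII
all(validUTF8(comments))  # TRUE if every string is valid UTF-8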
cleanCorpus <- function(vector){
  # build a tm corpus and normalise it: drop numbers, case, punctuation and stopwords
  corpus <- Corpus(VectorSource(vector), readerControl = list(language = "en_US"))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, tolower)
  #corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removePunctuation)
  #corpus <- tm_map(corpus, PlainTextDocument)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  return(corpus)
}
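
For reference, a quick way to see what cleanCorpus() produces on the sample above; a minimal sketch assuming tm is loaded:

library(tm)
corpus <- cleanCorpus(comments)
as.character(corpus[[1]])
# roughly: "merge branch master  gitinternalnetgitlivelegacycodebase  problem"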
# this function is provided by a team member (in the link I posted above)
test <- function(keywords_doc){

  BigramTokenizer <-  function(x)
    unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
  # create the term-document matrix with the bigram tokenizer
  keywords_matrix <- TermDocumentMatrix(keywords_doc, control = list(tokenize = BigramTokenizer))

  # remove sparse terms 
  keywords_naremoval <- removeSparseTerms(keywords_matrix, 0.99)

  # frequency of each term across all documents
  keyword.freq <- rowSums(as.matrix(keywords_naremoval))
  subsetkeyword.freq <- subset(keyword.freq, keyword.freq >= 20)
  frequentKeywordSubsetDF <- data.frame(term = names(subsetkeyword.freq), freq = subsetkeyword.freq)

  # sort the terms by descending frequency
  frequentKeywordDF <- data.frame(term = names(keyword.freq), freq = keyword.freq)
  frequentKeywordSubsetDF <- frequentKeywordSubsetDF[order(-frequentKeywordSubsetDF$freq), ]
  frequentKeywordDF <- frequentKeywordDF[order(-frequentKeywordDF$freq), ]

  # optionally draw a word cloud of the frequent terms
  # wordcloud(frequentKeywordDF$term, freq=frequentKeywordDF$freq, random.order = FALSE, rot.per=0.35, scale=c(5,0.5), min.freq = 30, colors = brewer.pal(8,"Dark2"))
  return(frequentKeywordDF)
}

corpus <- cleanCorpus(comments)
t <- test(corpus)
> head(t)
             term freq
added       added    6
html         html    6
tracking tracking    6
common     common    4
emails     emails    4
template template    4
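
For what it's worth, the tokenizer does return bigrams when called directly on a word vector (a minimal sketch using NLP::ngrams, which the tokenizer above builds on), so the tokenizer function itself does not seem to be the problem:

library(NLP)
unlist(lapply(ngrams(c("merge", "branch", "master"), 2), paste, collapse = " "))
# [1] "merge branch" "branch master"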

Thanks,

1 answer:

Answer 0 (score: 1)

I haven't found the cause yet, but if you are only interested in the bigram counts, regardless of which documents they occur in, you can get them with this pipeline:

library(tm)
library(dplyr)
library(quanteda)

# ... construct the corpus as in your post ...

corpus %>%
  unlist() %>%                                 # corpus -> plain character vector
  tokens() %>%                                 # tokenize with quanteda
  tokens_ngrams(2:2, concatenator = " ") %>%   # build the bigrams
  unlist() %>%                                 # flatten to one vector of bigrams
  as.data.frame() %>%                          # single column, named "." inside the pipe
  group_by_(".") %>%                           # group by that bigram column
  summarize(cnt = n()) %>%                     # count each bigram
  arrange(desc(cnt))                           # most frequent first
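
If you would rather stay inside quanteda for the counting step as well, the same numbers should come out of a document-feature matrix; a sketch using dfm() and topfeatures(), with corpus being the object built above:

corpus %>%
  unlist() %>%
  tokens() %>%
  tokens_ngrams(n = 2, concatenator = " ") %>%
  dfm() %>%
  topfeatures(10)   # named vector of the ten most frequent bigrams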