Comparing the bags of words of two documents and finding the matching words and their frequencies in the second document

Asked: 2017-08-21 06:47:33

Tags: r text-mining tm qdap sentimentr

I have computed the bags of words for 'yelp.csv', 'yelpp.csv', and 'yelpn.csv', and created a word-frequency matrix for each data set. Now I want to compare the words of yelp with those of yelpn, find which yelp words appear in yelpn along with their frequencies, and store the result as a matrix in a variable; then do the same for yelpp. yelp contains both positive and negative reviews, yelpp only positive ones, and yelpn only negative ones. Can anyone complete the code? I don't know whether this code is relevant, but I hope it is.

library(tm)    # removeNumbers(), removePunctuation(), bundled English stopword list
library(qdap)  # bag_o_words(), freq_terms()

getwd()
setwd("/Users/ash/RProjects/exc")
getwd()

# full data set (positive and negative reviews)
df <- read.csv("yelp.CSV", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
               strip.white = TRUE)
df
dfd <- as.character(df[, 2])   # second column as a character vector
dfd
df2 <- as.character(df[, 1])   # first column as a character vector
df2
# English stopword list shipped with the tm package
words <- readLines(system.file("stopwords", "english.dat", package = "tm"))
s <- remove_stopwords(dfd, words, lines = TRUE)
s
print(paste("****Stopwords are removed successfully****"))
n <- removeNumbers(s)
n
t <- removePunctuation(n, preserve_intra_word_dashes = FALSE)
t

#pos: only positive reviews
dfp <- read.csv("yelpp.CSV", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
                strip.white = TRUE)
dfp
dfdp <- as.character(dfp[, 2])
dfdp
df2p <- as.character(dfp[, 1])
df2p
wordsp <- readLines(system.file("stopwords", "english.dat", package = "tm"))
sp <- remove_stopwords(dfdp, wordsp, lines = TRUE)
sp
print(paste("****Stopwords are removed successfully****"))
np <- removeNumbers(sp)
np
tp <- removePunctuation(np, preserve_intra_word_dashes = FALSE)
tp

#neg: only negative reviews
dfn <- read.csv("yelpn.CSV", header = TRUE, quote = "\"", stringsAsFactors = TRUE,
                strip.white = TRUE)
dfn
dfdn <- as.character(dfn[, 2])
dfdn
df2n <- as.character(dfn[, 1])
df2n
wordsn <- readLines(system.file("stopwords", "english.dat", package = "tm"))
sn <- remove_stopwords(dfdn, wordsn, lines = TRUE)
sn
print(paste("****Stopwords are removed successfully****"))
nn <- removeNumbers(sn)
nn
tn <- removePunctuation(nn, preserve_intra_word_dashes = FALSE)
tn



#bag of words for each data set
b <- bag_o_words(t, apostrophe.remove = TRUE)
b
b.mat <- as.matrix(b)
b.mat
bp <- bag_o_words(tp, apostrophe.remove = TRUE)
bp
bp.mat <- as.matrix(bp)
bp.mat
bn <- bag_o_words(tn, apostrophe.remove = TRUE)
bn
bn.mat <- as.matrix(bn)
bn.mat

#frequent terms: top 2000 terms of each data set
frequent_terms <- freq_terms(b.mat, 2000)
frequent_terms
frequent_termsp <- freq_terms(tp, 2000)
frequent_termsp
frequent_termsn <- freq_terms(tn, 2000)
frequent_termsn

1 Answer:

Answer 0 (score: 2)

I am taking text from the Text mining wiki as example corpora. The tm package together with the findFreqTerms and agrep functions is the core of this approach.

agrep

Searches for approximate matches to pattern (the first argument) within each element of the string x (the second argument) using the generalized Levenshtein edit distance (the minimal possibly weighted number of insertions, deletions and substitutions needed to transform one string into another).
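
For example, approximate matching with agrep in base R works like this (the strings below are made up purely for illustration):

# "patern" is within one edit of "pattern", so only the first element matches
agrep("pattern", c("patern matching", "something unrelated"), max.distance = 1)
# [1] 1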

Steps of the approach:

text -> corpus -> data cleaning -> findFreqTerms -> compare against the term-document matrix of the other corpus

library(tm)

c1 <- Corpus(VectorSource("Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning"))

c2 <- Corpus(VectorSource("Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output"))

c3 <- Corpus(VectorSource("Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities)"))

# Data Cleaning and transformation
c1 <- tm_map(c1, content_transformer(tolower))
c2 <- tm_map(c2, content_transformer(tolower))
c3 <- tm_map(c3, content_transformer(tolower))

c1 <- tm_map(c1, removePunctuation)
c1 <- tm_map(c1, removeNumbers)
c1 <- tm_map(c1, removeWords, stopwords("english"))
c1 <- tm_map(c1, stripWhitespace)

c2 <- tm_map(c2, removePunctuation)
c2 <- tm_map(c2, removeNumbers)
c2 <- tm_map(c2, removeWords, stopwords("english"))
c2 <- tm_map(c2, stripWhitespace)

c3 <- tm_map(c3, removePunctuation)
c3 <- tm_map(c3, removeNumbers)
c3 <- tm_map(c3, removeWords, stopwords("english"))
c3 <- tm_map(c3, stripWhitespace)

# term-document matrices with tf-idf weighting (stopwords removed again for safety)
dtm1 <- DocumentTermMatrix(c1, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm2 <- DocumentTermMatrix(c2, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm3 <- DocumentTermMatrix(c3, control = list(weighting = weightTfIdf, stopwords = TRUE))

# terms of each corpus (no frequency threshold, so all terms are returned)
ft1 <- findFreqTerms(dtm1)
ft2 <- findFreqTerms(dtm2)
ft3 <- findFreqTerms(dtm3)

#similarity between c1 and c2
common.c1c2 <- data.frame(term = character(0), freq = integer(0))
for(t in ft1){
  # approximate match of term t against all terms of corpus 2
  find <- agrep(t, ft2)
  if(length(find) != 0){
    # keep the term and how many corpus-2 terms it matched
    common.c1c2 <- rbind(common.c1c2, data.frame(term = t, freq = length(find)))
  }
}
# Note: this for loop can be replaced with apply-family functions if it becomes slow on large texts
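
For instance, a sketch of such an apply-family variant (it builds the same common.c1c2 data frame, assuming ft1 and ft2 from above):

# for every term of corpus 1, count how many terms of corpus 2 it approximately matches
match.counts <- sapply(ft1, function(term) length(agrep(term, ft2)))
common.c1c2 <- data.frame(term = names(match.counts)[match.counts > 0],
                          freq = unname(match.counts)[match.counts > 0])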

common.c1c2 contains the words common to corpus1 and corpus2, together with their frequencies:

> common.c1c2
      term freq
1     also    1
2     data    2
3  derived    1
4 deriving    1
5   mining    1
6  pattern    1
7 patterns    1
8  process    1
9     text    1

> ft1
 [1] "also"        "analytics"   "data"        "derived"     "deriving"    "devising"    "equivalent" 
 [8] "highquality" "information" "learning"    "means"       "mining"      "pattern"     "patterns"   
[15] "process"     "referred"    "roughly"     "statistical" "text"        "trends"      "typically"  

> ft2
 [1] "addition"       "along"          "data"           "database"       "derived"        "deriving"      
 [7] "evaluation"     "features"       "finally"        "input"          "insertion"      "interpretation"
[13] "involves"       "linguistic"     "mining"         "others"         "output"         "parsing"       
[19] "patterns"       "process"        "removal"        "structured"     "structuring"    "subsequent"    
[25] "text"           "usually"        "within"        

This is not the most efficient solution, but I hope it helps.