I am trying to create a corpus, but I want to combine 2 consecutive words in each document; I do not want a corpus of single words.
I am using the script below. Is there a way to build the corpus "docs" so that each document is made up of 2-consecutive-word tokens (bigrams)? Please advise.
library(plyr)
library(tm)
library(e1071)
setwd("C:/Assignment/Assignment-Group-Prediction/IPM")
training <- read.csv("Data.csv", header = T, na.strings = c(""))
Res_Desc_Train <- subset(training,select=c("Group","Description"))
## Step 1: Create and clean the corpus
docs <- Corpus(VectorSource(Res_Desc_Train$Description))
docs <- tm_map(docs, content_transformer(tolower))
#remove potentially problematic symbols
toSpace <- content_transformer(function(x, pattern) { return (gsub(pattern, " ", x))})
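# removeSpecialChars keeps only letters, digits and spaces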
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]","",x)
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "-")
docs <- tm_map(docs, toSpace, ":")
docs <- tm_map(docs, toSpace, ";")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\(" )
docs <- tm_map(docs, toSpace, ")")
docs <- tm_map(docs, toSpace, ",")
docs <- tm_map(docs, toSpace, "_")
docs <- tm_map(docs, content_transformer(removeSpecialChars))
docs <- tm_map(docs, removeWords, stopwords("en"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, removeNumbers)
Answer 0 (score: 1):
The tm package FAQ answers your question directly:

Can I use bigrams instead of single tokens in a term-document matrix?

Yes. Package NLP provides functionality to compute n-grams, which can be used to construct a corresponding tokenizer. E.g.:
library("tm")
data("crude")
BigramTokenizer <-
  function(x)
    unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
tdm <- TermDocumentMatrix(crude, control = list(tokenize = BigramTokenizer))
inspect(removeSparseTerms(tdm[, 1:10], 0.7))
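Applied to your script, the same tokenizer plugs straight into DocumentTermMatrix. A minimal sketch, assuming docs is the cleaned corpus from your code above and BigramTokenizer is defined as in the FAQ snippet (the lowfreq threshold of 5 is just an example):

# build a document-term matrix whose terms are 2-word phrases
dtm <- DocumentTermMatrix(docs, control = list(tokenize = BigramTokenizer))

# sanity checks: overall dimensions/sparsity, then the bigrams seen at least 5 times
inspect(dtm)
findFreqTerms(dtm, lowfreq = 5)

The resulting bigram matrix can then be used wherever your single-word document-term matrix would have gone, e.g. converted with as.matrix() before being handed to a classifier from e1071. One caveat: depending on your tm version, Corpus(VectorSource(...)) may return a SimpleCorpus, which ignores custom tokenizers; building the corpus with VCorpus(VectorSource(...)) instead avoids that.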