几个月前,我在 R 中使用 tm 和其他软件包做过一些基本的文本处理。现在我想从语料库中调出某一篇特定的文档:在用 tm_map 做任何清理之前,这样调用一切正常;但在运行清理函数之后,同样的调用却会尝试显示所有文档的文本,把我的整个 R 会话拖得几乎卡死。有什么建议吗?是不是 tm 的某些基本机制发生了变化,导致这种语法不再适用?示例数据和代码如下:
# Sample data: one row per document, with an identifier column (doc_id)
# and a column holding the raw text (text).
file <- data.frame(
  doc_id = c("case1", "case2"),
  text = c(
    "this is some SAMPLE TEXT!",
    "and this is the 2nd version of that text..."
  ),
  stringsAsFactors = FALSE
)
print(file)
library(tm)
# Build a corpus from the sample data frame.
wordCorpus <- Corpus(DataframeSource(file))
# Inspect each document individually before any cleaning is applied.
wordCorpus[[1]][1] # as expected, shows one document
wordCorpus[[1]]$content # as expected, shows one document
wordCorpus[[2]][1] # as expected, shows one document
wordCorpus[[2]]$content # as expected, shows one document
# Print the internal structure of the corpus object.
str(wordCorpus)
# Clean the text
custom_stopwords <- c( "the") # custom stopword list, passed to removeWords in clean_corpus()
# Replace the two-word phrase "every day" with the single word "everyday".
#
# x: character vector to fix.
# Returns x with every occurrence of the phrase replaced.
word_fixer <- function(x) {
  gsub(pattern = "every day", replacement = "everyday", x = x)
}

# Quick check on a sentence that contains both spellings.
word_fixer("I love it every day... oops meant everyday")
# Cleaning function
#
# Normalise the text of every document in a tm corpus. Steps, in order:
# remove digits, lower-case, remove punctuation, apply word_fixer(),
# remove the words in custom_stopwords, and collapse repeated whitespace.
#
# wordCorpus: a tm corpus (e.g. the result of Corpus(DataframeSource(...))).
# Returns the transformed corpus, still holding one entry per input document.
clean_corpus <- function(wordCorpus) {
  wordCorpus <- tm_map(wordCorpus, removeNumbers)
  # tolower and word_fixer are plain character functions, so they are
  # wrapped with content_transformer() to act on document content.
  wordCorpus <- tm_map(wordCorpus, content_transformer(tolower))
  wordCorpus <- tm_map(wordCorpus, removePunctuation)
  wordCorpus <- tm_map(wordCorpus, content_transformer(word_fixer))
  wordCorpus <- tm_map(wordCorpus, removeWords, custom_stopwords)
  # Deliberately no trailing tm_map(wordCorpus, PlainTextDocument) step.
  # In current tm releases Corpus() on a DataframeSource yields a
  # SimpleCorpus, and tm_map() on a SimpleCorpus passes the whole vector of
  # texts to the function at once. PlainTextDocument would then wrap all
  # texts into a single document, so wordCorpus[[i]] would show every
  # document instead of just document i.
  tm_map(wordCorpus, stripWhitespace)
}
# Putting cleaner to work
wordCorpus <- clean_corpus (wordCorpus)
# Inspect individual documents again after cleaning.
wordCorpus[[1]][1] # reported symptom: now it shows both documents
wordCorpus[[2]][1]