# Loading required libraries
# Set up logistics such as reading in data and setting up corpus
```{r}
# Relative path to the local folder that holds the speech text files
folder.path <- "../data/InauguralSpeeches/"
# List the speech files. `pattern` is a regular expression, not a shell glob,
# so match a literal ".txt" suffix; this also guarantees the 4-character
# extension that the substr() call below strips off.
speeches <- list.files(path = folder.path, pattern = "\\.txt$")
# Drop the first 5 characters and the trailing ".txt" of each file name so
# that only "FirstLast-Term" is left
prez.out <- substr(speeches, 6, nchar(speeches) - 4)
# Placeholder vector of NAs, one entry per speech
length.speeches <- rep(NA, length(speeches))
# Build a corpus from the files in the folder
ff.all <- Corpus(DirSource(folder.path))
```
# Clean the data
```{r}
# Use tm_map to collapse runs of whitespace to a single space, convert to
# lower case, and remove stop words, empty strings and punctuation.
ff.all <- tm_map(ff.all, stripWhitespace)
ff.all <- tm_map(ff.all, content_transformer(tolower))
ff.all <- tm_map(ff.all, removeWords, stopwords("english"))
ff.all <- tm_map(ff.all, removeWords, c("can", "may", "upon", "shall", "will", "must", ""))
# NOTE: this is the line the question is about. gsub is handed to tm_map
# directly (not wrapped in content_transformer); with this line present the
# TermDocumentMatrix() call below is reported to fail with
# 'inherits(doc, "TextDocument") is not TRUE'.
ff.all <- tm_map(ff.all, gsub, pattern = "free", replacement = "freedom")
ff.all <- tm_map(ff.all, removeWords, character(0))
ff.all <- tm_map(ff.all, removePunctuation)
# tdm.all = a Term Document Matrix
tdm.all <- TermDocumentMatrix(ff.all)
所以我想在一个文本挖掘项目中用同一个词根替换相似的词。例如,把 "free" 替换为 "freedom"。
然后我从 Youtube 教程中学到了这一行:`ff.all<-tm_map(ff.all, gsub, pattern = "free", replacement = "freedom")`。没有这一行,代码可以正常运行。
添加此行后,R Studio 在执行 `tdm.all<-TermDocumentMatrix(ff.all)` 这一行时报错:"Error: inherits(doc, "TextDocument") is not TRUE"。
我认为这应该是一个相对简单的问题,但我无法在stackoverflow上找到解决方案。
答案 0 :(得分:1)
使用 tm 内置的 crude 数据,我可以通过把 gsub 包裹在 content_transformer 调用中来解决您的问题,如下所示:
ff.all<-tm_map(ff.all, content_transformer(function(x) gsub(x, pattern = "free", replacement = "freedom")))
根据我的经验,tm_map 在使用自定义函数时会对返回的对象做很多处理。因此,虽然您原来的那一行可以执行,但 tm_map 并没有返回一个真正的“语料库”,这就是导致错误的原因。
作为旁注:
这一行似乎没有任何作用:`ff.all<-tm_map(ff.all, removeWords, character(0))`
下面这一行中的 `""` 也是如此:
`ff.all<-tm_map(ff.all, removeWords, c("can", "may", "upon", "shall", "will", "must", ""))`
# Reproducible version of the cleaning pipeline, run on the `crude` data set
# loaded via data(crude) instead of the local speech files.
library(tm)
data(crude)
ff.all <- crude
# Collapse runs of whitespace, lower-case the text, then drop stop words
ff.all <- tm_map(ff.all, stripWhitespace)
ff.all <- tm_map(ff.all, content_transformer(tolower))
ff.all <- tm_map(ff.all, removeWords, stopwords("english"))
# The trailing "" entry appears to have no effect; kept to mirror the question
ff.all <- tm_map(ff.all, removeWords, c("can", "may", "upon", "shall", "will", "must", ""))
# Wrap gsub in content_transformer so it is applied to each document's content.
# The \\b word boundaries restrict the match to the whole word "free";
# a bare "free" pattern would also rewrite "freedom" into "freedomdom".
ff.all <- tm_map(
  ff.all,
  content_transformer(function(x) gsub(x, pattern = "\\bfree\\b", replacement = "freedom"))
)
# Removing an empty word list appears to be a no-op; kept to mirror the question
ff.all <- tm_map(ff.all, removeWords, character(0))
ff.all <- tm_map(ff.all, removePunctuation)
# tdm.all = a Term Document Matrix
tdm.all <- TermDocumentMatrix(ff.all)