我正在尝试使用 R 中的 tm 包从语料库中删除重复项。
例如,为了删除 & 符号,我使用以下 R 语句:
# Remove every HTML-escaped ampersand ("&amp;") from the character vector `x`.
# `fixed = TRUE` makes gsub() match the entity literally rather than as a
# regular expression. Returns a character vector the same length as `x`.
removeAmp <- function(x) gsub("&amp;", "", x, fixed = TRUE)
# Map removeAmp over myCorpus with tm_map() and keep the transformed corpus
# under the same name.
myCorpus <- tm_map(myCorpus, removeAmp)
然后我尝试使用以下内容删除重复项:
# Return `x` with repeated values dropped (first occurrence kept).
removeDup <- function(x) {
  unique(x)
}
# Attempt to de-duplicate by mapping removeDup over myCorpus with tm_map().
# The surrounding text reports that this call fails with a match.fun(FUN)
# "argument FUN is missing" error.
myCorpus <- tm_map(myCorpus, removeDup)
我收到错误消息:
match.fun(FUN)出错:缺少参数“FUN”,没有默认值
我也试过
# Flatten `x`, drop repeated values, and hand the survivors back as a list
# with one element per distinct value.
removeDup <- function(x) {
  flattened <- unlist(x)
  distinct <- unique(flattened)
  as.list(distinct)
}
但仍然出错。非常感谢任何帮助。
答案 0 :(得分:1)
可以使用以下代码删除重复的条目。
首先,将先前清理的语料库转换回数据框。
# Pull the "content" element out of every document in tweet.corpus and
# flatten the result into a one-column data frame (column `text`).
# stringsAsFactors is spelled FALSE (not F, which can be reassigned) so the
# text stays character rather than being converted to a factor.
df.tweets <- data.frame(
  text = unlist(sapply(tweet.corpus, `[`, "content")),
  stringsAsFactors = FALSE
)
其次,删除数据框中的重复条目
# Keep only the first occurrence of each distinct row of df.tweets.
tweets.out.unique <- df.tweets[!duplicated(df.tweets), , drop = FALSE]
第三,如果需要,再将其转换回语料库(假设数据框只有一列)
# Rebuild a corpus from the first column of the de-duplicated data frame.
# NOTE(review): newer tm releases appear to require `doc_id` and `text`
# columns for DataframeSource() - confirm against the installed tm version.
tweet.corpus.clean <- Corpus(DataframeSource(tweets.out.unique[1]))
我不知道这是否更优雅,但很容易!
答案 1 :(得分:0)
这对我有用:
# Run a fixed sequence of tm_map() clean-up passes over `corpus` and return
# the resulting corpus.
#
# Passes, in order: whitespace stripping, lower-casing, re-encoding via
# iconv(), punctuation removal, number removal, URL removal, two stop-word
# removal passes, a unique() pass, and conversion with PlainTextDocument.
#
# `corpus` is the tm corpus to clean.
# NOTE(review): `use.stopwords` is not defined in this function; it must be
# visible in an enclosing environment when the function is called.
clean.corpus <- function(corpus) {
  # Platform note from the original author: remove "mc.cores=1" for Windows
  # (only necessary for Macintosh).

  # Drop "http" together with any alphanumeric characters that follow it.
  removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)

  # Helper handed to tm_map() for the de-duplication pass.
  removeDup <- function(x) unique(x)

  # Language stop words plus project-specific terms to discard.
  myStopwords <- c(
    stopwords(use.stopwords),
    "twitter", "tweets", "tweet", "tweeting", "retweet", "followme",
    "account", "available", "via",
    "melinafollowme", "voten", "samier", "zsm", "hpa", "geraus", "vote",
    "gevotet", "dagibee", "berlin",
    "mal", "dass", "für", "votesami", "votedagi", "vorhersage",
    "\u2728\u2728\u2728\u2728\u2728", "\u2728\u2728\u2728"
  )

  result <- tm_map(corpus, stripWhitespace, lazy = TRUE)
  result <- tm_map(result, content_transformer(tolower), mc.cores = 1)
  result <- tm_map(
    result,
    content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte')),
    lazy = TRUE
  )
  result <- tm_map(result, removePunctuation, lazy = TRUE)
  result <- tm_map(result, removeNumbers, lazy = TRUE)
  result <- tm_map(result, removeURL)

  # First pass removes the extended list, second pass the plain language list.
  result <- tm_map(
    result,
    function(x) removeWords(x, myStopwords),
    mc.cores = 1
  )
  result <- tm_map(
    result,
    function(x) removeWords(x, stopwords(use.stopwords)),
    mc.cores = 1
  )

  result <- tm_map(result, removeDup, mc.cores = 1)
  result <- tm_map(result, PlainTextDocument)
  result
}