我正在尝试在 R 中把多个 txt 文件合并成一个 txt 文件,然后将该文件拆分为段落,先对这些段落进行预处理,再对段落两两进行相似度比较。如果两个段落非常相似,就把这两段合并,并重复这一过程,直到处理完所有段落。
# Combine every .txt file in a directory into a single file, split the
# combined text into paragraphs (runs of lines separated by blank lines), and
# preprocess each paragraph as one tm document.
library(tm)

# NOTE(review): setwd() is kept so that relative paths used elsewhere keep
# resolving as before; prefer file.path() on an explicit directory if nothing
# else relies on the working directory.
setwd("F:/My files/specification/files/texts")

output_file <- "8Files.txt"

# Only pick up .txt inputs, and skip the combined file itself: it lives in the
# same directory, so an unfiltered listing would re-ingest it on every re-run.
file_list <- setdiff(
  list.files(pattern = "\\.txt$", ignore.case = TRUE),
  output_file
)

# Read each file exactly once and keep its blank lines, because they are the
# paragraph delimiters. The trailing "" keeps the last paragraph of one file
# from fusing with the first paragraph of the next file.
dataset <- unlist(
  lapply(file_list, function(file) c(readLines(file, warn = FALSE), "")),
  use.names = FALSE
)
if (is.null(dataset)) {
  dataset <- character(0) # no input files found
}

# writeLines() stores the text verbatim (no header, row names or quotes).
writeLines(dataset, output_file)
x <- readLines(output_file, warn = FALSE) # read data with readLines

### split text file into paragraphs ###
# readLines() yields one element per line, so a "\n\n" pattern can never
# match inside an element. Instead, group consecutive non-blank lines: every
# blank line bumps the running paragraph id. useBytes = TRUE keeps the blank
# test from failing on lines with invalid multibyte sequences.
is_blank <- grepl("^[ \t\r]*$", x, useBytes = TRUE)
paragraph_id <- cumsum(is_blank)
a <- unname(vapply(
  split(x[!is_blank], paragraph_id[!is_blank]),
  paste,
  character(1),
  collapse = " "
))

### preprocess: one document per paragraph ###
# VCorpus instead of Corpus: Corpus(VectorSource(.)) builds a SimpleCorpus,
# which is where the "transformation drops documents" warning comes from.
raw_corpus <- VCorpus(VectorSource(a))
raw_corpus <- tm_map(
  raw_corpus,
  content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
)
# Plain base functions must be wrapped in content_transformer() so that the
# documents stay tm text documents instead of becoming bare character vectors.
raw_corpus <- tm_map(raw_corpus, content_transformer(tolower))
raw_corpus <- tm_map(raw_corpus, removeNumbers)
raw_corpus <- tm_map(raw_corpus, removePunctuation)
# NOTE(review): punctuation is stripped before stop words (original order), so
# contracted stop words such as "don't" have already become "dont" and are
# not removed here; swap the two steps if that matters.
raw_corpus <- tm_map(
  raw_corpus,
  removeWords,
  c(stopwords("english"), "shall", "will", "can", "could", "must")
)
raw_corpus <- tm_map(raw_corpus, stemDocument, language = "english")
raw_corpus <- tm_map(raw_corpus, stripWhitespace)
运行tm_map后,我收到以下警告消息:
In tm_map.SimpleCorpus(raw_corpus, tolower) :
transformation drops documents.
另外,完成这一步之后,我不知道该如何比较两个段落中的单词。请帮帮我。