我正在用 R 做一个文本挖掘方面的学期论文。我们的任务是判断文章的基调(正面/负面)。文章存储在各自的文件夹中。我需要创建一个通过训练样本来学习的分类系统。我重用了 http://www.youtube.com/watch?v=j1V2McKbkLo 中的代码。除最后一行之外,整段代码都已成功执行。以下是代码。
# Class labels; each label is also used as a sub-folder name under
# `folderpath` when the per-tone matrices are built.
tone <- c("Positive", "Negative")
folderpath <- "C:/Users/Tanmay/Desktop/R practice/Week8"
# The option is spelled `stringsAsFactors`. The misspelled `stringAsFactors`
# only creates an unused option and leaves the real setting untouched.
options(stringsAsFactors = FALSE)
# Load every document found directly under `folderpath` and look at it.
corpus <- Corpus(DirSource(folderpath))
corpuscopy <- corpus
summary(corpus)
inspect(corpus)
#Clean data
# Apply the text clean-up steps to a tm corpus and return the cleaned corpus.
#
# corpus: a tm corpus (as produced by Corpus()).
#
# Each step works on the output of the previous step. Feeding the original
# `corpus` into every call would throw away all steps except the last one.
CleanCorpus <- function(corpus) {
  corpustemp <- tm_map(corpus, removeNumbers)
  corpustemp <- tm_map(corpustemp, removePunctuation)
  # tolower() returns a plain character vector, so it is wrapped in
  # content_transformer() to keep the documents as tm text documents.
  corpustemp <- tm_map(corpustemp, content_transformer(tolower))
  corpustemp <- tm_map(corpustemp, removeWords, stopwords("english"))
  corpustemp <- tm_map(corpustemp, stemDocument, language = "english")
  corpustemp <- tm_map(corpustemp, stripWhitespace)
  corpustemp
}
#Document term matrix
# Build the document-term matrix for the documents of one tone.
#
# tone: name of the sub-folder of `path` that holds one class of documents.
# path: directory containing the per-tone sub-folders.
#
# Returns (invisibly) a list with the label in `Tone` and the matrix in `tdm`.
generateTDM <- function(tone, path) {
  tone_dir <- sprintf("%s/%s", path, tone)
  tone_source <- DirSource(directory = tone_dir, encoding = "ANSI")
  cleaned <- CleanCorpus(Corpus(tone_source))
  # Prune sparse terms using a sparsity threshold of 0.7.
  dtm <- removeSparseTerms(DocumentTermMatrix(cleaned), 0.7)
  invisible(list(Tone = tone, tdm = dtm))
}
# One list(Tone, tdm) entry per tone label.
tdm <- lapply(X = tone, FUN = generateTDM, path = folderpath)
#Attach tone
# Turn one tone's document-term matrix into a data frame with a label column.
#
# tdm: list with `Tone` (the class label) and `tdm` (documents x terms matrix).
#
# Returns a data frame with one row per document, one column per term, and a
# final `PredictTone` column that repeats the label for every row.
ToneBindTotdm <- function(tdm) {
  temp.mat <- as.matrix(tdm[["tdm"]])
  temp.df <- as.data.frame(temp.mat, stringsAsFactors = FALSE)
  # The row count is the `times` argument of rep(). Passing it to cbind()
  # instead appends a stray numeric column and leaves the tone strings among
  # the features, which knn() then fails to coerce to numbers.
  tone_column <- rep(tdm[["Tone"]], nrow(temp.df))
  cbind(temp.df, PredictTone = tone_column, stringsAsFactors = FALSE)
}
# One labelled data frame per tone.
Tonetdm <- lapply(X = tdm, FUN = ToneBindTotdm)
# Stack: combine the per-tone frames, then zero-fill the cells left as NA.
Stacktdm <- do.call(what = rbind.fill, args = Tonetdm)
Stacktdm[is.na(Stacktdm)] <- 0
# Holdout: a random 70% of the rows (rounded up) trains, the rest is tested.
trainid <- sample(nrow(Stacktdm), ceiling(0.7 * nrow(Stacktdm)))
testid <- setdiff(seq_len(nrow(Stacktdm)), trainid)
# knn: split the label column from the term columns, then classify.
tdmone <- Stacktdm[, "PredictTone"]
tdmone.nl <- Stacktdm[, colnames(Stacktdm) != "PredictTone"]
knnPredict <- knn(
  tdmone.nl[trainid, ],
  tdmone.nl[testid, ],
  tdmone[trainid],
  k = 5
)
当我尝试执行此操作时,我在最后一行(knn)收到错误:
**Error in knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid], :
NA/NaN/Inf in foreign function call (arg 6)
In addition: Warning messages:
1: In knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid], :
NAs introduced by coercion
2: In knn(tdmone.nl[trainid, ], tdmone.nl[testid, ], tdmone[trainid], :
NAs introduced by coercion**
有人可以帮助我吗?此外,如果有其他更简单、更好的分类方法,请告诉我。谢谢,也抱歉帖子这么长。
答案 0 :(得分:1)
我也遇到过同样的问题,后来我修改了代码,去掉了所有 NA 值。你可以对照我的代码,找出你的代码中可能存在的问题。
#init
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later at the first tm/plyr/class
# call.
libs <- c("tm", "plyr", "class")
invisible(lapply(libs, library, character.only = TRUE))
#set options
options(stringsAsFactors = FALSE)
#set parameters
# Each entry is used as a sub-folder name under `pathname`; rows labelled
# "test" are the ones classified later.
candidates <- c("user1", "user2", "test")
pathname <- "C:/Users/prabhjot.rai/Documents/Project_r/textMining"
#clean text
# Run the clean-up transformations over a tm corpus, one after another.
#
# corpus: a tm corpus.
#
# Returns (invisibly) the corpus after all transformations have been applied.
cleanCorpus <- function(corpus) {
  # Ordered pipeline; each step receives the output of the previous step.
  steps <- list(
    function(x) tm_map(x, removePunctuation),
    function(x) tm_map(x, stripWhitespace),
    function(x) tm_map(x, content_transformer(tolower)),
    function(x) tm_map(x, removeWords, stopwords("english")),
    function(x) tm_map(x, PlainTextDocument)
  )
  cleaned <- corpus
  for (step in steps) {
    cleaned <- step(cleaned)
  }
  invisible(cleaned)
}
#build TDM
# Build the term-document matrix for the documents of one candidate.
#
# cand: name of the sub-folder of `path` holding that candidate's documents.
# path: directory containing the per-candidate sub-folders.
#
# Returns (invisibly) a list with the label in `name` and the matrix in `tdm`.
generateTDM <- function(cand, path) {
  cand_dir <- sprintf("%s/%s", path, cand)
  cleaned <- cleanCorpus(Corpus(DirSource(directory = cand_dir)))
  # Prune sparse terms using a sparsity threshold of 0.7.
  pruned <- removeSparseTerms(TermDocumentMatrix(cleaned), 0.7)
  invisible(list(name = cand, tdm = pruned))
}
# One list(name, tdm) entry per candidate folder.
tdm <- lapply(X = candidates, FUN = generateTDM, path = pathname)
# Preview of the first candidate: transpose so documents become rows, then
# number the rows.
test <- t(data.matrix(tdm[[1]][["tdm"]]))
rownames(test) <- 1:nrow(test)
#attach name and convert to dataframe
# Convert one candidate's term-document matrix into a labelled data frame.
#
# thisTDM: list with `name` (the label) and `tdm` (terms x documents matrix).
#
# Returns a data frame with one row per document, one column per term, and a
# `candidateName` column holding the label.
makeMatrix <- function(thisTDM) {
  # Transpose so that documents become rows.
  doc_term <- t(data.matrix(thisTDM$tdm))
  # seq_len() also copes with zero documents, where 1:0 would give c(1, 0)
  # and make the row-name assignment fail.
  rownames(doc_term) <- seq_len(nrow(doc_term))
  labelled <- as.data.frame(doc_term, stringsAsFactors = FALSE)
  # rep() keeps the assignment valid for an empty frame as well.
  labelled$candidateName <- rep(thisTDM$name, nrow(labelled))
  labelled
}
# One labelled data frame per candidate.
candTDM <- lapply(X = tdm, FUN = makeMatrix)
# Stack all the speeches together, then zero-fill the cells left as NA.
tdm.stack <- do.call(what = rbind.fill, args = candTDM)
tdm.stack[is.na(tdm.stack)] <- as.numeric(0)
# Testing and training sets: rows labelled "test" get classified, all other
# rows are used for training. The label column is dropped in a second step.
train <- tdm.stack[tdm.stack[["candidateName"]] != "test", ]
train <- train[, colnames(train) != "candidateName"]
test <- tdm.stack[tdm.stack[["candidateName"]] == "test", ]
test <- test[, colnames(test) != "candidateName"]
# Class labels of the training rows, in the same row order as `train`.
classes <- as.factor(
  tdm.stack[tdm.stack[["candidateName"]] != "test", "candidateName"]
)
myknn <- knn(train = train, test = test, cl = classes, k = 1)
myknn
将测试文件保存在与 user1 和 user2 文件夹同级的 test 文件夹中,以检查此算法的输出。k 的值建议取演讲稿(文档)数量的平方根,最好是奇数。请忽略测试集和训练集赋值中的冗余写法:在我的机器上写成一行无法运行,所以分成了两行。