我是R中的一个完整的初学者,这是我第一次在stackoverflow上发帖。请温柔:)
我尝试通过以下教程和实际例子来学习R,但是却陷入了困境,并且不知道自己做错了什么。
我尝试按照发布here的说明进行操作。但是当我尝试训练模型时,请收到以下错误消息:
Error in na.fail.default(list(doc.class = c(3L, 1L, 1L, 1L, 1L, 1L, 1L, :
missing values in object
我希望有人能帮我理解这里发生了什么?我检查了tdmTrain,它只包含NA
个值。我不知道为什么以及如何解决它。
这是我收到错误消息的步骤的代码。
library(NLP)
library(tm)
library(caret)
r8train <- read.table("r8-train-all-terms.txt", header=FALSE, sep='\t')
r8test <- read.table("r8-test-all-terms.txt", header=FALSE, sep='\t')
# rename variables
names(r8train) <- c("Class", "docText")
names(r8test) <- c("Class", "docText")
# convert the document text variable to character type
r8train$docText <- as.character(r8train$docText)
r8test$docText <- as.character(r8test$docText)
# create varible to denote if observation is train or test
r8train$train_test <- c("train")
r8test$train_test <- c("test")
# merge the train/test data
merged <- rbind(r8train, r8test)
# remove objects that are no longer needed
remove(r8train, r8test)
merged <- merged[which(merged$Class %in% c("crude","money-fx","trade")),]
# drop unused levels in the response variable
merged$Class <- droplevels(merged$Class)
# counts of each class in the train/test sets
table(merged$Class,merged$train_test)
# a vector source interprets each element of the vector as a document
sourceData <- VectorSource(merged$docText)
# create the corpus
corpus <- Corpus(sourceData)
# preprocess/clean the training corpus
corpus <- tm_map(corpus, content_transformer(tolower)) # convert to lowercase
corpus <- tm_map(corpus, removeNumbers) # remove digits
corpus <- tm_map(corpus, removePunctuation) # remove punctuation
corpus <- tm_map(corpus, stripWhitespace) # strip extra whitespace
corpus <- tm_map(corpus, removeWords, stopwords('english')) # remove stopwords
# create term document matrix (tdm)
tdm <- DocumentTermMatrix(corpus)
as.matrix(tdm)[10:20,200:210] # inspect a portion of the tdm
# create tf-idf weighted version of term document matrix
weightedtdm <- weightTfIdf(tdm)
as.matrix(weightedtdm)[10:20,200:210] # inspect same portion of the weighted tdm
# find frequent terms: terms that appear in at least "250" documents here, about 25% of the docs
findFreqTerms(tdm, 250)
# convert tdm's into data frames
tdm <- as.data.frame(inspect(tdm))
weightedtdm <- as.data.frame(inspect(weightedtdm))
# split back into train and test sets
tdmTrain <- tdm[which(merged$train_test == "train"),]
weightedTDMtrain <- weightedtdm[which(merged$train_test == "train"),]
tdmTest <- tdm[which(merged$train_test == "test"),]
weightedTDMtest <- weightedtdm[which(merged$train_test == "test"),]
# remove objects that are no longer needed to conserve memory
remove(tdm,weightedtdm)
# append document labels as last column
tdmTrain$doc.class <- merged$Class[which(merged$train_test == "train")]
tdmTest$doc.class <- merged$Class[which(merged$train_test == "test")]
weightedTDMtrain$doc.class <- merged$Class[which(merged$train_test == "train")]
weightedTDMtest$doc.class <- merged$Class[which(merged$train_test == "test")]
# set resampling scheme
ctrl <- trainControl(method="repeatedcv",number = 10, repeats = 3) #,classProbs=TRUE)
# fit a kNN model using the weighted (td-idf) term document matrix
# tuning parameter: K
set.seed(100)
knn.tfidf <- train(doc.class ~ ., data = weightedTDMtrain, method = "knn", trControl = ctrl) #, tuneLength = 20)