当我在第二个文档中添加更多的 "eat" 时,`weighting = weightTfIdf` 参数本应降低 "eat" 的权重,但实际上并没有;而且 `varImp(fit)` 的结果始终相同,这是为什么?
请分别用包含多个 "eat" 和只包含一个 "eat" 的文档运行下面的代码,并比较 `train` 和 `varImp(fit)` 的结果。
希望您能提供帮助,在此先感谢您。
library(caret)
library(tm)
### Training data.
# Two toy documents: the first is labelled class 0, the second class 1.
# NOTE(review): weightTfIdf (normalised) scales a term's count by the length
# of its document and multiplies by log2(nDocs / docFreq); repeating "eat"
# inside one document therefore raises its weight rather than lowering it --
# confirm against the tm documentation.
train_docs <- c(
  "Cats like to chase mice.",
  "Dogs like to eat eat eat eat eat eat big bones."
)
corpus <- VCorpus(VectorSource(train_docs))

# Clean the text: strip punctuation and digits, lower-case, drop English stop
# words, collapse whitespace and finally stem every token.
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, stemDocument)

# Document-term matrix with tf-idf weights instead of raw counts.
dtm <- tm::DocumentTermMatrix(
  corpus,
  control = list(weighting = weightTfIdf)
)

# Dense copy of the weights, shown for inspection.
train <- as.matrix(dtm)
train

# Attach the class labels as column `y`, then switch to a data frame whose
# response is a factor.
train <- as.data.frame(cbind(train, y = c(0, 1)))
train[["y"]] <- as.factor(train[["y"]])
### Train.
# Fit caret's "bayesglm" method, using every term column of `train` as a
# predictor of the class label `y`.
fit <- caret::train(
  y ~ .,
  data = train,
  method = "bayesglm"
)

# Variable-importance scores of the fitted model.
caret::varImp(fit)

### Check accuracy on training.
# Predicted classes for the same documents the model was fitted on.
predict(
  fit,
  newdata = train
)
### Test data.
# The model was fitted on normalised tf-idf weights, so the test document has
# to be weighted the same way. Raw counts (as used previously) put the
# predictors on a different scale, and running weightTfIdf on the test corpus
# alone would not help: with a single document every idf is log2(1 / 1) = 0.
# The idf is therefore taken from the training corpus.
data2 <- c("Bats eat bugs.")

# `corpus` still holds the cleaned training documents at this point, so the
# test objects get their own names instead of overwriting `corpus` / `dtm`.
train_counts <- as.matrix(DocumentTermMatrix(corpus))
stopifnot(setequal(colnames(train_counts), Terms(dtm)))

# Inverse document frequency per training term: log2(nDocs / docFreq).
idf <- log2(nrow(train_counts) / colSums(train_counts > 0))
idf[!is.finite(idf)] <- 0  # guard against terms with zero document frequency

test_corpus <- VCorpus(VectorSource(data2))
test_control <- list(
  removePunctuation = TRUE,
  stopwords = TRUE,
  stemming = TRUE,
  removeNumbers = TRUE
)

# Raw counts restricted to (and ordered like) the training vocabulary.
test_dtm <- DocumentTermMatrix(
  test_corpus,
  control = c(list(dictionary = names(idf)), test_control)
)
test_counts <- as.matrix(test_dtm)[, names(idf), drop = FALSE]

# Document length over all remaining terms, including out-of-vocabulary ones,
# so that term frequency is normalised the same way as for training data.
test_len <- rowSums(
  as.matrix(DocumentTermMatrix(test_corpus, control = test_control))
)
test_len[test_len == 0] <- 1  # an empty document keeps all-zero weights

# tf-idf = (count / document length) * training idf; a data frame matches the
# structure the model was fitted on.
test <- as.data.frame(sweep(test_counts / test_len, 2, idf, "*"))

### Check accuracy on test.
predict(fit, newdata = test)