Hi, I am trying to build a classifier that has to recognise tweets with negative sentiment (manually labelled -1) and positive sentiment (1). I am fitting several supervised models to my sample data (about 1800 tweets), using 70% of the sample as the training set and the remaining 30% as the test set. The problem is that the results are unsatisfactory: I get high test errors and low AUC values. So where is the problem, and is there a way to improve these results? My sample is available here: https://drive.google.com/open?id=0B9DO29WohGN6eHM3UTM1OUdWVnM. Below I report my analysis step by step, so it is fully reproducible, together with the main results:
##reading csv
tweets = read.csv2("Finale2.csv",stringsAsFactors = FALSE)
str(tweets)
tweets$Negative <- as.factor(tweets$Sent <= -1)  # TRUE = negative sentiment
table(tweets$Negative)
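For reference (so the test errors below have a baseline to compare against), the class proportions give the accuracy of a trivial classifier that always predicts the majority class. This is just a quick sanity check, not part of the modelling:
###class balance / majority-class baseline
prop.table(table(tweets$Negative))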
###pre-processing steps
install.packages("tm")
install.packages("SnowballC")
library(tm)
library(SnowballC)
###create a corpus of words and clean it
corpus <- Corpus(VectorSource(tweets$Tweet))
corpus <- tm_map(corpus, content_transformer(tolower))  # content_transformer keeps a valid corpus in tm >= 0.6
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeWords, c("RT", "rt", "https*"))
corpus <- tm_map(corpus, stripWhitespace)
##stemming
corpus <- tm_map(corpus, stemDocument)
##creating a matrix of word frequencies
DTM <- DocumentTermMatrix(corpus)
DTM
##remove sparse terms
sparse = removeSparseTerms(DTM, 0.995)
###convert the sparse matrix into a dataframe
tweetsSparse <- as.data.frame(as.matrix(sparse))
names(tweetsSparse)
###make all variables R friendly
colnames(tweetsSparse) <- make.names(colnames(tweetsSparse))
##add the outcome (dependent) variable
tweetsSparse$Negative <- tweets$Negative
###loading package caTools
install.packages("caTools")
library(caTools)
###splitting training set and test set
set.seed(234)
splitNegative <- sample.split(tweetsSparse$Negative, SplitRatio = 0.7)
trainSparse <- subset(tweetsSparse, splitNegative == TRUE)
testSparse <- subset(tweetsSparse, splitNegative == FALSE)
library(MASS)
lda.fit = lda(Negative ~ ., data = trainSparse)
lda.fit
###prediction on the test set
lda.pred=predict(lda.fit,testSparse)
table(testSparse$Negative,lda.pred$class)
        FALSE TRUE
  FALSE   134  119
  TRUE    123  176
testerror=(119+123)/nrow(testSparse)  # misclassified = off-diagonal counts
testerror
[1] 0.4384058
####ROC CURVE
install.packages("ROCR")
library(ROCR)
pred <- prediction(lda.pred$posterior[,2], testSparse$Negative)
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)
abline(0, 1, lty = 2)
as.numeric(performance(pred, "auc")@y.values)
[1] 0.598411
install.packages("glmnet")
library(glmnet)
##create the x matrix and the y vector for the training and the test set
x.train = model.matrix(Negative ~ . -1, data = trainSparse)
y.train=trainSparse$Negative
x.test = model.matrix(Negative ~ . -1, data = testSparse)
y.test=testSparse$Negative
##lasso logistic model
sent.lasso = glmnet(x.train, y.train, family = "binomial")
plot(sent.lasso, xvar = "lambda", label = TRUE)
sent.lasso
##select the best lambda with 10-fold CV:
cv.lasso=cv.glmnet(x.train,y.train,family="binomial")
plot(cv.lasso)
coef(cv.lasso)
###using the best model to predict on the test set
pred.lasso = predict(cv.lasso, x.test, s = cv.lasso$lambda.1se, type = "class")
table(testSparse$Negative, pred.lasso)
        pred.lasso
         FALSE TRUE
  FALSE     40  213
  TRUE      18  281
lassoerror=(213+18)/nrow(testSparse)
lassoerror
[1] 0.4184783
###ROC lasso
prob.lasso = predict(cv.lasso, x.test, s = cv.lasso$lambda.1se, type = "response")
predob = prediction(prob.lasso, testSparse$Negative)
perf = performance(predob, "tpr", "fpr")
par(mfrow = c(1, 1))
plot(perf, main = "LASSO Logistic Regression")
plot(perf, colorize = TRUE)
plot(perf, colorize = TRUE, print.cutoffs.at = seq(0, 1, by = 0.1), text.adj = c(-0.2, 1.7))
abline(0, 1, lty = 2)
as.numeric(performance(predob, "auc")@y.values)
[1] 0.6266673
install.packages("tree")
library(tree)
sent.tree = tree(Negative ~ ., data = trainSparse)
summary(sent.tree)
plot(sent.tree)
text(sent.tree,pretty=0)
Here I have some doubts about how to interpret the tree: the number of terminal nodes is very poor.
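In case it is useful, here is a sketch of what I thought of trying to get a more usable tree: grow a deeper tree by relaxing mindev in tree.control and then let cross-validation choose how far to prune it back (using cv.tree and prune.misclass from the tree package). I have not verified that this actually improves anything on my data:
###sketch: grow a deeper tree, then prune it back by cross-validation
deep.tree = tree(Negative ~ ., data = trainSparse,
                 control = tree.control(nobs = nrow(trainSparse), mindev = 0.001))
set.seed(3)
cv.sent = cv.tree(deep.tree, FUN = prune.misclass)
plot(cv.sent$size, cv.sent$dev, type = "b")  # CV misclassifications vs. number of terminal nodes
best.size = cv.sent$size[which.min(cv.sent$dev)]  # if this is 1, the tree is not informative
pruned.tree = prune.misclass(deep.tree, best = best.size)
plot(pruned.tree)
text(pruned.tree, pretty = 0)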
set.seed(2)
tree.pred = predict(sent.tree, testSparse, type = "class")
table(testSparse$Negative, tree.pred)
        tree.pred
         FALSE TRUE
  FALSE     14  239
  TRUE       5  294
treerror=(239+5)/nrow(testSparse)
treerror
[1] 0.442029
##Roc
tree.pred = predict(sent.tree, testSparse, type = "vector") # predict probabilities
library(ROCR)
pred <- prediction(tree.pred[,2], testSparse$Negative)
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)
as.numeric(performance(pred, "auc")@y.values)
[1] 0.5195381
Very bad!!
install.packages("randomForest")
library(randomForest)
set.seed(345)
sent.rf = randomForest(Negative ~ ., data = trainSparse)
sent.rf
##plot the error rate
plot(sent.rf$err.rate[,1])
###variable importance plot
varImpPlot(sent.rf)
###performance on the test set
rf.pred = predict(sent.rf, testSparse, type = "class")
table(testSparse$Negative, rf.pred)
        rf.pred
         FALSE TRUE
  FALSE    162   91
  TRUE     119  180
rferror=(119+91)/nrow(testSparse)
rferror
[1] 0.3804348
In this case I was not able to compute the ROC curve, so if anyone knows how to do it, please post the code.
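My best guess at how it could be done, assuming predict() with type = "prob" returns the class probabilities of the random forest (please correct me if this is wrong):
###sketch: ROC and AUC for the random forest from predicted class probabilities
library(ROCR)
rf.prob = predict(sent.rf, testSparse, type = "prob")  # matrix with one column per class
pred.rf = prediction(rf.prob[, 2], testSparse$Negative)  # column 2 = P(Negative == TRUE)
perf.rf = performance(pred.rf, "tpr", "fpr")
plot(perf.rf, colorize = TRUE, main = "Random Forest")
abline(0, 1, lty = 2)
as.numeric(performance(pred.rf, "auc")@y.values)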
###Support Vector Classifier
install.packages("e1071")
library(e1071)
svmfit = svm(as.factor(Negative) ~ ., data = trainSparse, kernel = "linear", cost = 1)
names(svmfit)
summary(svmfit)
svmfit$index
##selecting the best SVC with 10-fold CV
set.seed(1)
tune.out = tune(svm, Negative ~ ., data = trainSparse, kernel = "linear",
                ranges = list(cost = c(0.1, 1, 10, 100, 1000)), probability = TRUE)
At this point I get a warning: reaching max number of iterations. If anyone knows how to solve this problem, please help me. The same problem also occurs with the polynomial kernel.
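A variant that might be worth trying is a smaller, coarser cost grid, on the guess that the very large cost values are what makes libsvm converge slowly on this high-dimensional data (I do not know whether this actually removes the warning). Note that probability = TRUE is passed outside ranges so it is not treated as a tuning parameter:
###sketch: linear-kernel tuning with a smaller cost grid
set.seed(1)
tune.lin = tune(svm, Negative ~ ., data = trainSparse, kernel = "linear",
                ranges = list(cost = c(0.01, 0.1, 1, 10)),
                probability = TRUE)
summary(tune.lin)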
tune.out = tune(svm, Negative ~ ., data = trainSparse, kernel = "radial",
                ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5, 1, 2, 3, 4)),
                probability = TRUE)
summary(tune.out)
best=tune.out$best.model
###prediction on the test set
svm.pred = predict(best, testSparse, type = "class")
table(testSparse$Negative,svm.pred)
        svm.pred
         FALSE TRUE
  FALSE     30  223
  TRUE       5  294
radialerror=(223+5)/nrow(testSparse)
radialerror
[1] 0.4130435
###Roc curve
yhat.opt = predict(best,testSparse,probability = TRUE)
pred <- prediction(attributes(yhat.opt)$probabilities[,2], testSparse$Negative)
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)
as.numeric(performance(pred, "auc")@y.values)
[1] 0.5908827
Well, I hope this can be a useful example for someone and that the question does not get put on hold. If anyone can help me improve these results or solve one of these problems, please comment below.