使用R的多类逻辑回归预测情绪分数

时间:2017-03-13 17:49:32

标签: regression linear-regression word2vec sentiment-analysis

[图片:enter image description here]

我正在尝试使用 R 的 glmnet 包,通过逻辑回归构建一个情感分析分类器。以下是 R 代码:

 library(tidyverse)
library(text2vec)
library(caret)
library(glmnet)
library(ggrepel)
# Load the raw training and test surveys (semicolon-separated CSV files).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned. `stringsAsFactors = FALSE` keeps the text columns as character
# vectors on R versions older than 4.0, where the default was TRUE.
Train_classifier <- read.csv("IRC.csv", header = TRUE, sep = ";",
                             stringsAsFactors = FALSE)
Test_classifier <- read.csv("IRC2.csv", header = TRUE, sep = ";",
                            stringsAsFactors = FALSE)

# Keep only the four columns used below: the score, the free-text comment,
# the entry date and the participant id.
keep_cols <- c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")
Train <- Train_classifier[, keep_cols]
Test <- Test_classifier[, keep_cols]

# Drop rows whose free-text comment is missing or blank.
# trimws() strips surrounding whitespace, so a blank comment becomes "" and can
# never equal " "; the original comparison against " " therefore kept every
# row. Comparing against "" is what actually removes blank comments.
# filter() also drops rows for which the condition evaluates to NA.
subTrain <- filter(Train, trimws(Raison.Reco) != "")
# seq_len() yields an empty vector for a 0-row frame (seq.int(0) would not).
subTrain$ID <- seq_len(nrow(subTrain))

# Collapse the raw score into four classes:
#   0-4 -> 0, 5-6 -> 1, 7-8 -> 2, anything else -> 3 (NA stays NA).
subTrain$Note.Reco <- ifelse(
  subTrain$Note.Reco >= 0 & subTrain$Note.Reco <= 4, 0,
  ifelse(
    subTrain$Note.Reco >= 5 & subTrain$Note.Reco <= 6, 1,
    ifelse(subTrain$Note.Reco >= 7 & subTrain$Note.Reco <= 8, 2, 3)
  )
)

# Same blank-comment filter for the test set.
# NOTE(review): subTest$Note.Reco is left on its raw scale here; recode it the
# same way before comparing predictions against it.
subTest <- filter(Test, trimws(Raison.Reco) != "")
subTest$ID <- seq_len(nrow(subTest))

# Data pre-processing
# Text preparation shared by both corpora: lower-case the comment, then
# tokenize it with text2vec's word_tokenizer.
prep_fun <- tolower
tok_fun <- word_tokenizer

# Coerce every column of both data frames to character (this includes the
# class label and the ID column, not just the comment text).
subTrain[] <- lapply(subTrain, function(column) as.character(column))
subTest[] <- lapply(subTest, function(column) as.character(column))

# Token iterator over the training comments, keyed by the row ID.
it_train <- itoken(
  subTrain$Raison.Reco,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = subTrain$ID,
  progressbar = TRUE
)

# Token iterator over the test comments, built the same way.
it_test <- itoken(
  subTest$Raison.Reco,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = subTest$ID,
  progressbar = TRUE
)


# Build the vocabulary and the document-term matrices (DTM).

### Training data
vocab_train <- create_vocabulary(it_train)
vectorizer_train <- vocab_vectorizer(vocab_train)
# Fixed: the original passed `vectorizer`, which is never defined.
dtm_train <- create_dtm(it_train, vectorizer_train)

### Test data
# The test DTM is built with the *training* vectorizer so that it has the same
# columns as dtm_train. A vocabulary derived from the test set would produce a
# different feature space that a model fitted on dtm_train cannot consume.
dtm_test <- create_dtm(it_test, vectorizer_train)

## TF-IDF weighting
tfidf <- TfIdf$new()
# Fit the IDF weights on the training data and transform it in one step.
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
# Only apply the already-fitted model to the test data. Calling fit_transform()
# here would refit the weights on the test set.
dtm_test_tfidf <- transform(dtm_test, tfidf)

# Cross-validated, L1-penalised multinomial logistic regression on the
# TF-IDF features of the training comments.
glmnet_classifier <- cv.glmnet(
  x = dtm_train_tfidf,
  # Explicit factor: the response is a class label, not a number.
  y = factor(subTrain[["Note.Reco"]]),
  family = "multinomial",
  # L1 (lasso) penalty
  alpha = 1,
  # Misclassification error. glmnet documents "auc" as available for two-class
  # logistic regression only, so it is not a valid measure for a multinomial
  # model.
  type.measure = "class",
  # 5-fold cross-validation
  nfolds = 5,
  # A higher threshold is less accurate but trains faster
  thresh = 1e-3,
  # Likewise, fewer iterations for faster training
  maxit = 1e3
)

# Cross-validation curve: the chosen measure against log(lambda).
plot(glmnet_classifier)

这是数据subTrain的结构:

    Note.Reco Raison.Reco  DATE_SAISIE idpart ID

    3 Good service 19/03/2014 56992
    2 good stuff 19/03/2014 53645
    8 very nice 20/02/2016 261392
    ...

我得到了这张图(见附件)。如果这个结果是正确的,你能帮我解释一下吗?谢谢。

0 个答案:

没有答案
相关问题