我尝试使用word2vec和逻辑多项式回归创建情感分析系统。
为此,我试着像作者在这里做的那样:http://analyzecore.com/2017/02/08/twitter-sentiment-analysis-doc2vec/
这里是R代码:
library(tidyverse)
library(text2vec)
library(caret)
library(glmnet)
library(ggrepel)
Train_classifier <- read.csv('IRC.csv',header=T, sep=";")
Test_classifier <- read.csv('IRC2.csv',header=T, sep=";")
# select only 4 column of the dataframe
Train <- Train_classifier[, c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")]
Test <- Test_classifier[, c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")]
#delete rows with empty value columns
subTrain <- Train[rowSums(Train == '') == 0,]
subTrain$ID <- seq.int(nrow(subTrain))
subTrain
DATE_SAISIE idpart ID Raison.Reco Note.Reco
2 19/03/2014 102853645 1 Good 0
3 19/03/2014 1072309 2 Not good 2
4 19/03/2014 191391 3 very good 9
6 19/03/2014 14529 4 not comment 8
7 19/03/2014 100065501 5 very professional 9
8 19/03/2014 102261392 6 very good 1
9 19/03/2014 102734704 7 good 10
10 19/03/2014 1004397 8 not very good 10
# # replacing class values
subTrain$Note.Reco = ifelse(subTrain$Note.Reco >= 0 & subTrain$Note.Reco <= 4, 0, ifelse(subTrain$Note.Reco >= 5 &
subTrain$Note.Reco <= 6, 1, ifelse(subTrain$Note.Reco >= 7 & subTrain$Note.Reco <= 8, 2, 3)))
subTest <- Test[rowSums(Test == '') == 0,]
subTest$ID <- seq.int(nrow(subTest))
#Data pre processing
#Doc2Vec
prep_fun <- tolower
tok_fun <- word_tokenizer
subTrain[] <- lapply(subTrain, as.character)
it_train <- itoken(subTrain$Raison.Reco,
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = subTrain$ID,
progressbar = TRUE)
subTest[] <- lapply(subTest, as.character)
it_test <- itoken(subTest$Raison.Reco,
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = subTest$ID,
progressbar = TRUE)
#creation of vocabulary and term document matrix
### fichier d'apprentissage
vocab_train <- create_vocabulary(it_train)
vectorizer_train <- vocab_vectorizer(vocab_train)
dtm_train <- create_dtm(it_train, vectorizer_train)
### test data
vocab_test <- create_vocabulary(it_test)
vectorizer_test <- vocab_vectorizer(vocab_test)
dtm_test <- create_dtm(it_test, vectorizer_test)
##Define tf-idf model
tfidf <- TfIdf$new()
# fit the model to the train data and transform it with the fitted model
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
dtm_test_tfidf <- fit_transform(dtm_test, tfidf)
glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf,
y = subTrain[['Note.Reco']], family = 'multinomial',type.multinomial = "grouped")
当我运行此代码时,我收到此错误:
> glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf, + y = subTrain[['Note.Reco']], family = 'multinomial',type.multinomial = "grouped") Error in if (!all(o)) { : missing value where TRUE/FALSE needed
请问好吗?
谢谢
编辑:
问题来自这条线路@Edward Moseley在他的评论中说:
subTrain[] <- lapply(subTrain, as.character)
但是当我删除它并移动到这一行时:
it_train <- itoken(subTrain$Raison.Reco,
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = subTrain$ID,
progressbar = TRUE)
我收到此错误:
Error in UseMethod("itoken") :
no applicable method for 'itoken' applied to an object of class "factor"
subTrain
> subTrain
Note.Reco
1 3
2 3
3 2
4 3
5 3
6 1
7 3
8 1
9 2
10 3
11 3
12 3
13 3
14 2
15 2
16 3
17 3
18 2
19 3
20 2
21 2
22 2
23 0
24 0
25 2
26 3
27 3
28 0
29 0
30 2
31 3
32 3
33 3
34 3
35 0
36 1
37 2
38 1
39 3
40 3
41 3
42 1
43 3
44 2
45 3
46 3
47 2
48 3
49 3
50 2
51 1
52 1
53 2
54 3
55 3
56 2
57 2
58 3
59 2
60 1
61 3
62 0
63 2
64 2
65 3
66 0
67 1
68 3
69 2
70 2
71 3
72 2
73 2
74 2
75 3
76 2
77 2
78 3
79 3
80 3
81 3
82 2
83 2
84 1
85 0
86 2
87 0
88 3
89 3
90 3
91 2
92 1
93 2
94 1
95 3
96 3
97 2
98 2
99 3
100 3
101 0
102 2
103 2
104 0
105 2
106 3
107 2
108 2
109 2
110 3
111 3
112 2
113 2
114 2
115 3
116 3
117 2
118 3
119 3
120 3
121 3
122 2
123 3
124 2
125 2
126 0
127 3
128 3
129 0
130 3
131 0
132 1
133 3
134 2
135 0
136 1
137 3
138 1
139 3
140 3
141 3
142 2
143 2
144 3
145 2
146 2
147 3
148 1
149 1
150 3
151 2
152 2
153 3
154 2
155 3
156 2
157 3
158 3
159 3
160 0
161 2
162 1
163 3
164 3
165 1
166 2
167 2
168 3
169 2
170 3
171 3
172 3
173 2
174 2
175 3
176 3
177 0
178 3
179 2
180 3
181 0
182 3
183 3
184 2
185 3
186 3
187 1
188 3
189 1
190 2
191 2
192 3
193 3
194 3
195 2
196 2
197 3
198 2
199 0
200 3
... <truncated>
1
2 ... <truncated>
3 ... <truncated>
4 ... <truncated>
5
6 ... <truncated>
7
我看到了这个&#34;截断值&#34;在数据框中,这是正常的吗?
答案 0 :(得分:1)
我认为你在lapply()
时会引入问题。
尝试检查:(dim(subTrain[['Note.Reco']]) > 0)
如果评估为TRUE
,那么您可能会遇到与我在下面描述的问题不同的问题。
从glmnet github我看到:if (!all(o)) {
作为glmnet/R/lognet.R
的一部分,(我假设它由cv.glmnet
调用)。在我们看到的lognet函数中:
27 weights=drop(y%*%rep(1,nc))
28 o=weights>0
29 if(!all(o)){ #subset the data
早些时候,在第2行,我们看到nc
已定义:
2 nc=dim(y)
其余代码取决于nc
的值,因此我建议为初学者确定dim(subTrain[['Note.Reco']])
的结果。您可以尝试以不同方式访问这些数据:
glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf, y = subTrain$Note.Reco, family = 'multinomial',type.multinomial = "grouped")
另外,如果你构造一个data.frame
我们可以复制/粘贴它来使用它会更容易调试。
答案 1 :(得分:0)
使用glmnet和text2vec时我也遇到过这种问题。在我的情况下,问题是由失踪案件引起的。原因是all()
调用的函数cv.glment
不适用于缺失值。为解决此问题,我从数据中删除了NAs
。