Question

我尝试使用word2vec和逻辑多项式回归创建情感分析系统。

为此，我试着像作者在这里做的那样：http://analyzecore.com/2017/02/08/twitter-sentiment-analysis-doc2vec/

这里是R代码：

library(tidyverse)
library(text2vec)
library(caret)
library(glmnet)
library(ggrepel)

Train_classifier <- read.csv('IRC.csv',header=T, sep=";")
Test_classifier <- read.csv('IRC2.csv',header=T, sep=";")

# select only 4 column of the dataframe

Train <- Train_classifier[, c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")]
Test <- Test_classifier[, c("Note.Reco", "Raison.Reco", "DATE_SAISIE", "idpart")]

#delete rows with empty value columns
subTrain <- Train[rowSums(Train == '') == 0,]
subTrain$ID <- seq.int(nrow(subTrain))

 subTrain
      DATE_SAISIE    idpart    ID  Raison.Reco  Note.Reco
2      19/03/2014 102853645     1  Good         0
3      19/03/2014   1072309     2  Not good     2
4      19/03/2014    191391     3  very good    9
6      19/03/2014     14529     4  not comment  8
7      19/03/2014 100065501     5  very professional  9
8      19/03/2014 102261392     6  very good  1
9      19/03/2014 102734704     7  good  10
10     19/03/2014   1004397     8  not very good  10

# # replacing class values
subTrain$Note.Reco = ifelse(subTrain$Note.Reco >= 0 & subTrain$Note.Reco <= 4, 0, ifelse(subTrain$Note.Reco >= 5 &
subTrain$Note.Reco <= 6, 1, ifelse(subTrain$Note.Reco >= 7 & subTrain$Note.Reco <= 8, 2, 3)))


subTest <- Test[rowSums(Test == '') == 0,]
subTest$ID <- seq.int(nrow(subTest))

#Data pre processing
#Doc2Vec

prep_fun <- tolower
tok_fun <- word_tokenizer

subTrain[] <- lapply(subTrain, as.character)
it_train <- itoken(subTrain$Raison.Reco, 
                   preprocessor = prep_fun, 
                   tokenizer = tok_fun,
                   ids = subTrain$ID,
                   progressbar = TRUE)



subTest[] <- lapply(subTest, as.character)
it_test <- itoken(subTest$Raison.Reco, 
                   preprocessor = prep_fun, 
                   tokenizer = tok_fun,
                   ids = subTest$ID,
                   progressbar = TRUE)


#creation of vocabulary and term document matrix
  ### fichier  d'apprentissage
vocab_train <- create_vocabulary(it_train)
vectorizer_train <- vocab_vectorizer(vocab_train)
dtm_train <- create_dtm(it_train, vectorizer_train)


  ###  test data



vocab_test <- create_vocabulary(it_test)
vectorizer_test <- vocab_vectorizer(vocab_test)
dtm_test <- create_dtm(it_test, vectorizer_test)

##Define  tf-idf model 

tfidf <- TfIdf$new()
# fit the model to the train data and transform it with the fitted model
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
dtm_test_tfidf <- fit_transform(dtm_test, tfidf)

glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf,
                               y = subTrain[['Note.Reco']], family = 'multinomial',type.multinomial = "grouped")

当我运行此代码时，我收到此错误：

> glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf,
+                                y = subTrain[['Note.Reco']], family = 'multinomial',type.multinomial = "grouped")
Error in if (!all(o)) { : missing value where TRUE/FALSE needed

请问好吗？

谢谢

编辑：

问题来自这条线路@Edward Moseley在他的评论中说：

subTrain[] <- lapply(subTrain, as.character)

但是当我删除它并移动到这一行时：

it_train <- itoken(subTrain$Raison.Reco, 
                   preprocessor = prep_fun, 
                   tokenizer = tok_fun,
                   ids = subTrain$ID,
                   progressbar = TRUE)

我收到此错误：

Error in UseMethod("itoken") : 
  no applicable method for 'itoken' applied to an object of class "factor"




subTrain
> subTrain
      Note.Reco
1             3
2             3
3             2
4             3
5             3
6             1
7             3
8             1
9             2
10            3
11            3
12            3
13            3
14            2
15            2
16            3
17            3
18            2
19            3
20            2
21            2
22            2
23            0
24            0
25            2
26            3
27            3
28            0
29            0
30            2
31            3
32            3
33            3
34            3
35            0
36            1
37            2
38            1
39            3
40            3
41            3
42            1
43            3
44            2
45            3
46            3
47            2
48            3
49            3
50            2
51            1
52            1
53            2
54            3
55            3
56            2
57            2
58            3
59            2
60            1
61            3
62            0
63            2
64            2
65            3
66            0
67            1
68            3
69            2
70            2
71            3
72            2
73            2
74            2
75            3
76            2
77            2
78            3
79            3
80            3
81            3
82            2
83            2
84            1
85            0
86            2
87            0
88            3
89            3
90            3
91            2
92            1
93            2
94            1
95            3
96            3
97            2
98            2
99            3
100           3
101           0
102           2
103           2
104           0
105           2
106           3
107           2
108           2
109           2
110           3
111           3
112           2
113           2
114           2
115           3
116           3
117           2
118           3
119           3
120           3
121           3
122           2
123           3
124           2
125           2
126           0
127           3
128           3
129           0
130           3
131           0
132           1
133           3
134           2
135           0
136           1
137           3
138           1
139           3
140           3
141           3
142           2
143           2
144           3
145           2
146           2
147           3
148           1
149           1
150           3
151           2
152           2
153           3
154           2
155           3
156           2
157           3
158           3
159           3
160           0
161           2
162           1
163           3
164           3
165           1
166           2
167           2
168           3
169           2
170           3
171           3
172           3
173           2
174           2
175           3
176           3
177           0
178           3
179           2
180           3
181           0
182           3
183           3
184           2
185           3
186           3
187           1
188           3
189           1
190           2
191           2
192           3
193           3
194           3
195           2
196           2
197           3
198           2
199           0
200           3
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        ... <truncated>
1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       ... <truncated>
3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       ... <truncated>
4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       ... <truncated>
5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       ... <truncated>
7

我看到了这个＆＃34;截断值＆＃34;在数据框中，这是正常的吗？

Answer 1

我认为你在lapply()时会引入问题。

尝试检查：(dim(subTrain[['Note.Reco']]) > 0)

如果评估为TRUE，那么您可能会遇到与我在下面描述的问题不同的问题。

从glmnet github我看到：if (!all(o)) {作为glmnet/R/lognet.R的一部分，（我假设它由cv.glmnet调用）。在我们看到的lognet函数中：

27   weights=drop(y%*%rep(1,nc)) 
28   o=weights>0 
29   if(!all(o)){ #subset the data

早些时候，在第2行，我们看到nc已定义：

2   nc=dim(y)

其余代码取决于nc的值，因此我建议为初学者确定dim(subTrain[['Note.Reco']])的结果。您可以尝试以不同方式访问这些数据：

glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf, y = subTrain$Note.Reco, family = 'multinomial',type.multinomial = "grouped")

另外，如果你构造一个data.frame我们可以复制/粘贴它来使用它会更容易调试。

Answer 2

使用glmnet和text2vec时我也遇到过这种问题。在我的情况下，问题是由失踪案件引起的。原因是all()调用的函数cv.glment不适用于缺失值。为解决此问题，我从数据中删除了NAs。

if（！all（o））{：缺少值为TRUE / FALSE需要R时出错

2 个答案: