我使用 R 包 text2vec 对文本进行词向量化,以便用于特征选择。具体做法参照了 Dmitriy Selivanov 的教程 http://text2vec.org/vectorization.html,该教程说明了如何在构建分类器之前正确使用 text2vec。


# Loading packages and movie review data
library(text2vec)    # itoken, create_vocabulary, vocab_vectorizer, create_dtm
library(data.table)  # setDT, setkey, J
library(stringr)     # str_to_lower, str_replace_all
library(magrittr)    # %>%
data("movie_review")

# Converting the movie reviews to a data.table by reference
# (setkey() and the J() lookup below require a data.table)
setDT(movie_review)

# Sorting the data table by ID
setkey(movie_review, id)

# Set seed for reproducible results (the value itself is arbitrary);
# without it sample() yields a different split on every run
set.seed(2017L)

# Splitting data set into training and testing data:
# 4000 randomly drawn ids form the training set, all remaining ids the test set
all_ids <- movie_review$id
train_ids <- sample(all_ids, 4000)
test_ids <- setdiff(all_ids, train_ids)
train <- movie_review[J(train_ids)]
test <- movie_review[J(test_ids)]

# Vocabulary-based vectorization

# Define preprocessing function and tokenization function
# prep_fun: clean raw text before it is tokenized.
#   x       the text to clean (handed straight to str_to_lower())
#   returns the lower-cased text in which every non-alphabetic character
#           has been replaced by a space and whitespace runs are collapsed
prep_fun <- function(x) {
  # make text lower case
  lowered <- str_to_lower(x)
  # replace everything that is not a letter (digits included) with a space
  letters_only <- str_replace_all(lowered, "[^[:alpha:]]", " ")
  # collapse multiple whitespace characters into a single space
  str_replace_all(letters_only, "\\s+", " ")
}

# Tokenization function: word_tokenizer, kept under its own name so the
# same tokenizer can be reused wherever an iterator is built
tok_fun <- word_tokenizer

# Iterator over the tokens of the training reviews; each review is cleaned
# with prep_fun, split with tok_fun and labelled with its id
it_train <- itoken(
  train$review,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = train$id,
  progressbar = FALSE
)

# Vocabulary collected from the training iterator
vocab <- create_vocabulary(it_train)

# The author reports that the created vocabulary consists of 35070 unique words.
# Now that the vocabulary is set up, it is time to construct a
# document-term matrix (DTM).
# vocab_vectorizer() turns the vocabulary into the vectorizer that
# create_dtm() applies to the token iterator.
vectorizer <- vocab_vectorizer(vocab)
t1 <- Sys.time()
dtm_train <- create_dtm(it_train, vectorizer)
# Elapsed time for building the DTM, in seconds
print(difftime(Sys.time(), t1, units = "secs"))

# Vectorization and DTM creation of the training data is now complete.
# Looking up the dimensions of the created DTM
dim(dtm_train)
# Checking that the DTM rows are in the same order as the training ids
identical(rownames(dtm_train), train$id)

然后我继续使用glmnet R包来拟合LASSO回归模型。

# Training the model using LASSO regression to avoid high variance in coefficients
library(glmnet)  # cv.glmnet

# Number of cross-validation folds. Only set here when no value has been
# defined earlier, so an existing NFOLDS is left untouched.
if (!exists("NFOLDS")) {
  NFOLDS <- 5L
}

t1 <- Sys.time()
glmnet_classifier <- cv.glmnet(
  x = dtm_train,
  y = train[["sentiment"]],
  family = "binomial",
  # L1 penalty
  alpha = 1,
  # interested in the area under ROC curve
  type.measure = "auc",
  # number of cross-validation folds (5 unless NFOLDS was set elsewhere)
  nfolds = NFOLDS,
  # high value is less accurate, but has faster training
  thresh = 1e-3,
  # again lower number of iterations for faster training
  maxit = 1e3
)
# Elapsed training time, in seconds
print(difftime(Sys.time(), t1, units = "secs"))

# Since the classifier is now trained, the cross-validated AUC curve can be
# plotted with plot(glmnet_classifier).

# It is also possible to report the best cross-validated AUC as a number:
# cvm holds the mean cross-validated measure for each lambda.
print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))

# Now that the model has been fitted to the DTM, it is time to validate the model's performance on the test data.
# Therefore, the same preprocessing and tokenization functions will be applied, as well as the same vectorizer function.

# Token iterator over the test reviews, built with the same preprocessing
# and tokenization functions as the training iterator
it_test <- itoken(
  test$review,
  preprocessor = prep_fun,
  tokenizer = tok_fun,
  ids = test$id,
  progressbar = FALSE
)

# DTM of the test data, created with the vectorizer built for the training data
dtm_test <- create_dtm(it_test, vectorizer)

# Predictions on the test data; [, 1] keeps the first column of the
# matrix returned by predict()
preds <- predict(glmnet_classifier, dtm_test, type = "response")[, 1]

# AUC on the test data.
# NOTE(review): auc is reached through ':::', i.e. it is not exported by
# glmnet, so this call may break with other glmnet versions.
glmnet:::auc(test$sentiment, preds)



