我正在尝试解决一个案例研究,其中有4列(产品标题,描述,搜索查询和装修得分),我必须使用文本特征(标题和描述)来预测一个连续变量(装修得分)。连续变量是拟合得分,以1-3的比例给出,其中1为最差表现,3为完美。装修得分表示针对描述和产品标题列的搜索查询的拟合程度。我首先使用tm包清理了r中的文本列...然后进行了回归,但是我的R平方非常低
我的下面的代码;
topic_data <- read_excel("Scientist_Stage1_Test.xlsx",sheet = 2)
#Create a dtm
#corp <- Corpus(VectorSource(topic_data$title))
#corp <-
#corp %>% tm_map(., FUN = tolower) %>% tm_map(., FUN = removeWords,
stopwords("english")) %>% tm_map(., stripWhitespace)
#dtm <- DocumentTermMatrix(corp)
# function to clean documents and return a document-term matrix - Title
clean_text_dtm <- function(text_vector){
corpus = gsub("\\s+", " ", str_trim(text_vector)) # remove extra sapces
corpus = Corpus(VectorSource(corpus)) #converting the entire text in corpus
to perform cleaning further with tm package
corpus = tm_map(corpus, content_transformer(tolower)) # converting the text
to lower case
corpus = tm_map(corpus, removePunctuation) # removing the punctuation
corpus = tm_map(corpus, removeNumbers) # remmoving all numbers (digits)
stop_words <- c("in", "x","d","h","w","up to","of the","can be","with
the","for each","to the")
corpus = tm_map(corpus, removeWords,c(stopwords("english"),stop_words)) #
removing stop words
#STEMMING不会执行,因为本质有时会丢失
corpusDTM = DocumentTermMatrix(corpus, list(minWordLength = 1)) # converting
corpus into document-term matrix
#corpusDTM <- removeSparseTerms(corpusDTM, sparse = 0.9999)
}
translated.dtm <- clean_text_dtm(topic_data$title)
feature_df <- as.data.frame(as.matrix(translated.dtm))
#Create a dtm1
#corp1 <- Corpus(VectorSource(topic_data$description))
#corp1 <-
#corp1 %>% tm_map(., FUN = tolower) %>% tm_map(., FUN = removeWords,
stopwords("english")) %>% tm_map(., stripWhitespace)
#dtm1 <- DocumentTermMatrix(corp1)
#dtm <- removeSparseTerms(x = dtm,sparse = 0.995)
# function to clean documents and return a document-term matrix -
description
clean_text_dtm <- function(text_vector){
corpus = gsub("\\s+", " ", str_trim(text_vector)) # remove extra sapces
corpus = Corpus(VectorSource(corpus)) #converting the entire text in corpus
to perform cleaning further with tm package
corpus = tm_map(corpus, content_transformer(tolower)) # converting the text
to lower case
corpus = tm_map(corpus, removePunctuation) # removing the punctuation
corpus = tm_map(corpus, removeNumbers) # removing all numbers (digits)
stop_words <- c("may vary","easy to","to the","on the","to your","it
is","of","and is","from the","to help","and in","to in","in your")
corpus = tm_map(corpus, removeWords,c(stopwords("english"),stop_words)) #
removing stop words
# STEMMING is not performed as the essence is lost sometimes
corpusDTM = DocumentTermMatrix(corpus, list(minWordLength = 1)) #
converting corpus into document-term matrix
#corpusDTM <- removeSparseTerms(corpusDTM, sparse = 0.9999)
}
translated.dtm1 <- clean_text_dtm(topic_data$description)
feature_df1 <- as.data.frame(as.matrix(translated.dtm1))
topic_data <- cbind(feature_df1,topic_data,feature_df)
#topic_data <- cbind(feature_df,topic_data)
#colnames(topic_data)[7:length(topic_data)] <-
paste0("var",7:length(topic_data))
###建立模型###
###splitting our data into a training set and a testing set
Train <- subset(topic_data, topic_data$TrainingData == 1)
Test <- subset(topic_data, topic_data$TrainingData == 0)
##Training our model###
model <- lm(fitment_score ~ title + description , Train)
summary(model)
Topic_Prediction <- predict(model, Test)
results <- cbind(Topic_Prediction,Test$fitment_score)
colnames(results) <- c('pred','real')
results <- as.data.frame(results)
to_zero <- function(x){
if (x < 0){
return(0)
}else{
return(x)
}
}
results$pred <- sapply(results$pred,to_zero)
mse <- mean((results$real-results$pred)^2)
print(mse)
mse^0.5
SSE = sum((results$pred - results$real)^2)
SST = sum((mean(topic_data$fitment_score) - results$real)^2)
R2 <- 1 - SSE/SST
R2