Question

我正在尝试解决一个案例研究，其中有4列（产品标题，描述，搜索查询和装修得分），我必须使用文本特征（标题和描述）来预测一个连续变量（装修得分）。连续变量是拟合得分，以1-3的比例给出，其中1为最差表现，3为完美。装修得分表示针对描述和产品标题列的搜索查询的拟合程度。我首先使用tm包清理了r中的文本列...然后进行了回归，但是我的R平方非常低

我的下面的代码；

topic_data <- read_excel("Scientist_Stage1_Test.xlsx",sheet = 2)
#Create a dtm
#corp <- Corpus(VectorSource(topic_data$title))
#corp <-
#corp %>% tm_map(., FUN = tolower) %>% tm_map(., FUN = removeWords, 
stopwords("english")) %>% tm_map(., stripWhitespace)
#dtm <- DocumentTermMatrix(corp)


# function to clean documents and return a document-term matrix - Title
clean_text_dtm <- function(text_vector){
corpus = gsub("\\s+", " ", str_trim(text_vector)) # remove extra sapces
corpus = Corpus(VectorSource(corpus)) #converting the entire text in corpus 
to perform cleaning further with tm package
corpus = tm_map(corpus, content_transformer(tolower)) # converting the text 
to lower case
corpus = tm_map(corpus, removePunctuation) # removing the punctuation
corpus = tm_map(corpus, removeNumbers) # remmoving all numbers (digits)

stop_words <- c("in", "x","d","h","w","up to","of the","can be","with 
the","for each","to the")
corpus = tm_map(corpus, removeWords,c(stopwords("english"),stop_words)) # 
removing stop words

＃STEMMING不会执行，因为本质有时会丢失

corpusDTM = DocumentTermMatrix(corpus, list(minWordLength = 1)) # converting 
corpus into document-term matrix
#corpusDTM <- removeSparseTerms(corpusDTM, sparse = 0.9999)

}

translated.dtm <- clean_text_dtm(topic_data$title)

feature_df <- as.data.frame(as.matrix(translated.dtm))

#Create a dtm1
#corp1 <- Corpus(VectorSource(topic_data$description))

#corp1 <-
#corp1 %>% tm_map(., FUN = tolower) %>% tm_map(., FUN = removeWords, 
stopwords("english")) %>% tm_map(., stripWhitespace)
#dtm1 <- DocumentTermMatrix(corp1)

#dtm <- removeSparseTerms(x = dtm,sparse = 0.995)


# function to clean documents and return a document-term matrix - 
description
clean_text_dtm <- function(text_vector){
corpus = gsub("\\s+", " ", str_trim(text_vector)) # remove extra sapces
corpus = Corpus(VectorSource(corpus)) #converting the entire text in corpus 
to perform cleaning further with tm package
corpus = tm_map(corpus, content_transformer(tolower)) # converting the text 
to lower case
corpus = tm_map(corpus, removePunctuation) # removing the punctuation
corpus = tm_map(corpus, removeNumbers) # removing all numbers (digits)

stop_words <- c("may vary","easy to","to the","on the","to your","it 
is","of","and is","from the","to help","and in","to in","in your")
corpus = tm_map(corpus, removeWords,c(stopwords("english"),stop_words)) # 
removing stop words

# STEMMING is not performed as the essence is lost sometimes

 corpusDTM = DocumentTermMatrix(corpus, list(minWordLength = 1)) # 
 converting corpus into document-term matrix
 #corpusDTM <- removeSparseTerms(corpusDTM, sparse = 0.9999)

 }

translated.dtm1 <- clean_text_dtm(topic_data$description)


feature_df1 <- as.data.frame(as.matrix(translated.dtm1))
topic_data <- cbind(feature_df1,topic_data,feature_df)

#topic_data <- cbind(feature_df,topic_data)
#colnames(topic_data)[7:length(topic_data)] <- 
paste0("var",7:length(topic_data))

###建立模型###

使用lm（）建立模型

###splitting our data into a training set and a testing set
Train <- subset(topic_data, topic_data$TrainingData == 1)
Test <- subset(topic_data, topic_data$TrainingData == 0)

##Training our model###
model <- lm(fitment_score ~ title + description , Train)
summary(model)

Topic_Prediction <- predict(model, Test)
results <- cbind(Topic_Prediction,Test$fitment_score)
colnames(results) <- c('pred','real')
results <- as.data.frame(results)

to_zero <- function(x){
if (x < 0){
return(0)
}else{
 return(x)
 }
 }

results$pred <- sapply(results$pred,to_zero)

mse <- mean((results$real-results$pred)^2)
print(mse)

mse^0.5
SSE = sum((results$pred - results$real)^2)
SST = sum((mean(topic_data$fitment_score) - results$real)^2)
R2 <- 1 - SSE/SST
R2

r平方低

使用lm（）建立模型

0 个答案: