使用R的文本分类算法

时间:2017-07-19 10:45:33

标签: r nlp text-classification

我想利用包含文本字段“Description”和类别字段“Class”的历史数据来预测新文档的类别。

下面是我正在使用的脚本,但对于我想要预测的新文档,准确率并不理想。有人能告诉我可以使用哪种算法来提高准确率吗?请指教。

library(plyr)
library(tm)
library(e1071)

# Load the labelled history ("past") and the documents to classify ("future").
# Paths are built explicitly instead of calling setwd(), so the session's
# working directory is left untouched.
data_dir <- "C:/Data"

# Empty cells are read as NA. Text is kept as character (not factor) so the
# descriptions reach the text-mining steps as plain strings.
past <- read.csv(
  file.path(data_dir, "Past - Copy.csv"),
  header = TRUE, na.strings = "", stringsAsFactors = FALSE
)
future <- read.csv(
  file.path(data_dir, "Future - Copy.csv"),
  header = TRUE, na.strings = "", stringsAsFactors = FALSE
)

# Stack both sets: rows 1..nrow(past) are the history, the rest are the new
# documents. rbind.fill() pads columns missing from either file with NA.
training <- rbind.fill(past, future)

# Keep only the label and the free-text column used for modelling.
Res_Desc_Train <- subset(training, select = c("Class", "Description"))

## Step 1: Build a weighted document-term matrix from the descriptions
## (history and new documents together).

# as.character() guards against the column having been read as a factor.
docs <- Corpus(VectorSource(as.character(Res_Desc_Train$Description)))

# Clean-up order matters: stop words are removed while apostrophes are still
# present, otherwise contractions such as "don't" turn into "dont" and no
# longer match the stop-word list. Whitespace is collapsed last so the gaps
# left by the removals are cleaned up as well.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)

# inspect(docs[440])

# Stop words were already removed above, so the DTM-level filter stays off;
# only terms with at least two characters are kept.
dtm <- DocumentTermMatrix(
  docs,
  control = list(stopwords = FALSE, wordLengths = c(2, Inf))
)

## Keep only terms that occur in more than 5% of the documents.
dtm <- removeSparseTerms(dtm, sparse = 0.95)

# TF-IDF weighting with term frequencies normalised per document.
Weighteddtm <- weightTfIdf(dtm, normalize = TRUE)

# Dense data frame: one row per document, one column per term, label last.
mat.df <- as.data.frame(as.matrix(Weighteddtm), stringsAsFactors = FALSE)
mat.df[["Class"]] <- Res_Desc_Train$Class

Res_Desc_Train_Assign <- mat.df$Class

# Number of documents per class (NA labels are not counted by table()).
Assignment.Distribution <- table(mat.df$Class)

### Features have different ranges; min-max scaling brings each one to [0, 1].
### (z-score standardisation would be an alternative; it is not used here.)

#' Min-max rescale a numeric vector to the interval [0, 1]
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, `NA`s are ignored when computing the range and stay
#'   `NA` in the result. With the default `FALSE`, any `NA` in `x` makes the
#'   whole result `NA`.
#' @return A vector the same length as `x`. A vector whose values are all
#'   equal maps to all zeros rather than `NaN`.
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(x)
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (isTRUE(span == 0)) {
    # Constant input: avoid 0 / 0 = NaN, which would poison later model fits.
    return(x - rng[1])
  }
  (x - rng[1]) / span
}
# normalize(c(1, 2, 3, 4, 5))

# Rescale every term column to [0, 1]; the last column of mat.df is the label.
num_col <- ncol(mat.df) - 1
stopifnot(num_col >= 1)

# List-style indexing with seq_len() keeps a data frame even when only one
# term column is left (mat.df[, 1:num_col] would collapse to a vector).
mat.df_normalize <- as.data.frame(lapply(mat.df[seq_len(num_col)], normalize))

# A term with the same weight in every document can come back as NaN from
# min-max scaling (0 / 0). Such a column carries no signal, and svm()'s
# default na.action would then discard every row, so it is dropped here.
finite_cols <- vapply(
  mat.df_normalize,
  function(col) all(is.finite(col)),
  logical(1)
)
mat.df_normalize <- mat.df_normalize[, finite_cols, drop = FALSE]
mat.df_normalize[["Class"]] <- Res_Desc_Train_Assign

# names(mat.df)
outcomeName <- "Class"

# Rows 1..nrow(past) are the labelled history; the remaining rows are the new
# documents. Fail early instead of letting `a:b` count backwards on empty sets.
n_train <- nrow(past)
n_total <- nrow(training)
stopifnot(n_train > 0, n_total > n_train, n_total == nrow(mat.df_normalize))

train <- mat.df_normalize[seq_len(n_train), , drop = FALSE]
test <- mat.df_normalize[seq(n_train + 1, n_total), , drop = FALSE]

# factor() also discards label levels that do not occur in the training rows.
train$Class <- factor(train$Class)

### SVM model
feature_names <- setdiff(names(train), outcomeName)
x <- train[, feature_names, drop = FALSE]
y <- train$Class
# probability = TRUE is required to get class probabilities from predict().
model <- svm(x, y, probability = TRUE)

test1 <- test[, feature_names, drop = FALSE]
svm.pred <- predict(model, test1, decision.values = TRUE, probability = TRUE)
svm_prob <- attr(svm.pred, "probabilities")

# New documents with their features, predicted class and class probabilities.
finalresult <- cbind(test, svm.pred, svm_prob)

1 个答案:

答案 0 :(得分:0)

我们来尝试对您的SVM模型进行调参吧。

您正在使用默认参数运行模型,因此无法获得更好的准确性。运行模型是一个迭代过程,您可以在其中更改参数,运行模型,检查准确性,然后再次重复整个过程。

# Grid-search cost and gamma for a radial-kernel SVM; tune() evaluates every
# combination by cross-validation on the training data.
model <- tune(
  svm,
  train.x = x, train.y = y,
  kernel = "radial",
  ranges = list(cost = 10^(-1:2), gamma = c(.5, 1, 2))
)
print(model)

# Take the best cost / gamma straight from the tuning result instead of
# copying them by hand from the printed output.
best_params <- model$best.parameters

# probability = TRUE keeps the model usable with predict(..., probability = TRUE).
tuned_model <- svm(
  x, y,
  kernel = "radial",
  cost = best_params$cost,
  gamma = best_params$gamma,
  probability = TRUE
)
# Now check the accuracy of this model on the test set, adjust the `ranges`
# grid accordingly, and repeat the whole process.

希望这有帮助!