我们准备了机器学习算法,例如基于特征因子的分类算法,并对英文文本数据进行主题建模。
下面是我们编写的脚本。
# Text preprocessing ----
# Keep only the columns used downstream and pull out the raw text.
complete <- subset(complete, select = c(Group, Type, Text, Target))
# as.character() guards against Text being stored as a factor.
data <- as.character(complete$Text)

# The corpus has to exist before the first tm_map() call; build it from the
# text vector, one document per element of `data`.
corpus <- VCorpus(VectorSource(data))

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

# Deletes every character that is not an ASCII letter, an ASCII digit or a
# space.
# NOTE(review): the character class is ASCII-only, so accented Latin letters
# and all CJK/Hangul/Kana characters are deleted as well. This step must be
# replaced (e.g. by a Unicode-aware class) before non-English text is fed in.
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]", "", x)

# Lower-case once, up front. Nothing below re-introduces upper-case letters,
# so a second tolower pass later in the pipeline would be a no-op.
corpus <- tm_map(corpus, content_transformer(tolower))

# Turn separator characters into spaces so the words around them stay apart.
# Both parentheses are escaped so each pattern is an unambiguous regex.
for (separator in c("/", "-", ":", ";", "@", "\\(", "\\)", ",", "_")) {
  corpus <- tm_map(corpus, toSpace, separator)
}

corpus <- tm_map(corpus, content_transformer(removeSpecialChars))
# NOTE(review): apostrophes are already gone at this point, so contractions
# such as "don't" have become "dont" and will not match the stop-word list.
# The order is kept as-is because it determines the saved vocabulary.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stemDocument)
# Feature matrix ----
# Term counts of the cleaned corpus, converted to an ordinary matrix.
tdm <- DocumentTermMatrix(corpus)
train1 <- as.matrix(tdm)

# Metadata columns that go into the model next to the term counts.
complete1 <- subset(complete, select = c(Group, Type, Target))

# Keep only the columns that take more than one distinct value.
complete1 <- complete1[which(vapply(
  complete1,
  function(column) length(unique(column)) > 1,
  logical(1)
))]

# Metadata first, then one column per term.
train <- cbind(complete1, train1)
train[["Text"]] <- NULL
train[["Target"]] <- as.factor(train[["Target"]])
# Model Run ----

# Terms of the document-term matrix, saved together with the model.
termlist <- list(dictionary = Terms(tdm))

# SVM with Target as the response and every other column of `train` as a
# predictor.
fit <- svm(Target ~ ., data = train)

# Bundle the fitted model, the term list and the source data into one file.
retval <- list(
  model = fit,
  termlist = termlist,
  complete = complete
)
saveRDS(retval, file = "./modelTarget.rds")
现在我们预计会收到其他语言的数据——中文、韩文、日文、法文、葡萄牙文和西班牙文。
我们想确认 R 是否支持这些语言的数据,尤其是在文本清理方面。请给出建议。