我正在尝试根据此处提供的指南进行多标签文本分类:https://mlr-org.github.io/Multilabel-Classification-with-mlr/
我收到此错误: checkLearnerBeforeTrain(任务,学习者,体重)中的错误: 任务“ cottonseed.Class”是一个一类问题,但学习者“ classif.rpart”不支持该问题!
其中Cottonseed.Class是我的班级标签之一。我总共有117个班级标签,所以我不确定为什么会收到“一个班级问题”的错误
我的特征/单词(列)和文档(行)均来自文档术语矩阵。类标签是我data.frame末尾的列,每行(文档)的值均为TRUE / FALSE。
这是代码:
library(tm)
library(proxy)
library(RTextTools)
library(fpc)
library(wordcloud)
library(cluster)
library(stringi)
library(dplyr)
library(magrittr)
#install.packages("tm.corpus.Reuters21578", repos = "http://datacube.wu.ac.at")
library(tm.corpus.Reuters21578)
data(Reuters21578)
reuters = Reuters21578
# remove all documents that do not have topic category for classification (remaining 11367)
reuters = tm_filter(reuters, FUN = function(x) !identical(meta(x)[["topics_cat"]] , character(0)))
# some documents appear to be empty -> remove all empty docs (remaining 11305)
reuters = tm_filter(reuters, FUN = function(x) !identical(meta(x)[["heading"]] , character(0)))
# get the trainset and the testset
reuters_lewissplit = tm_filter(reuters, FUN = function(x) meta(x)[["lewissplit"]] == "TRAIN" || meta(x)[["lewissplit"]] == "TEST")
# extract all topics/categories from the train and test sets
allTopics_lewissplit <- sapply(reuters_lewissplit, function(x){x$meta$topics_cat})
classes = unique(unlist(sapply(reuters_lewissplit, function(x){x$meta$topics_cat}), recursive = FALSE, use.names = FALSE))
classes[order(classes)]
# remove dashes because package mlr complains
library(stringr)
classes <- str_replace(classes, "-", ".")
# data frame with logical representation of classes
classesDF = data.frame(matrix(FALSE, ncol = length(classes)+1, nrow = length(allTopics_lewissplit)))
# I am adding the .Class to each class name because mlr complains if the class name is the same as a feature name
classes = paste0(classes, ".Class")
colnames(classesDF) <- c(classes, c("TRAIN"))
for (i in 1:length(allTopics_lewissplit)) {
topics = unique(allTopics_lewissplit[[i]])
topics <- str_replace(topics, "-", ".")
topics = paste0(topics, ".Class")
classesDF[i,topics] = TRUE
if (meta(reuters_lewissplit[[i]])[["lewissplit"]] == "TRAIN") {
classesDF[i,"TRAIN"] = TRUE
}
}
# remove numbers
reuters_lewissplit <- tm_map(reuters_lewissplit, removeNumbers)
# eliminate extra white spaces
reuters_lewissplit <- tm_map(reuters_lewissplit, stripWhitespace)
# convert to lower case
reuters_lewissplit <- tm_map(reuters_lewissplit, content_transformer(tolower))
# remove stop words
reuters_lewissplit <- tm_map(reuters_lewissplit, removeWords, stopwords("english"))
# length(stopwords("english"))
# stopwords("english")
# remove punctuation
reuters_lewissplit <- tm_map(reuters_lewissplit, removePunctuation)
# create Document Term Matrix (DTM)
ndocs <- length(reuters_lewissplit)
# ignore extremely rare words i.e. terms that appear in less then 1% of the documents
minTermFreq <- ndocs * 0.01
# ignore overly common words i.e. terms that appear in more than 50% of the documents
maxTermFreq <- ndocs * .5
dtm = DocumentTermMatrix(reuters_lewissplit,
control = list(
wordLengths=c(4, 15),
bounds = list(global = c(minTermFreq, maxTermFreq)),
weighting = weightTfIdf
))
dtm.matrix = as.matrix(dtm)
####################################################################################################################
# Multilabel classification
####################################################################################################################
library(mlr)
# join the dtm with the class labels
tmp = cbind (data.frame(dtm.matrix), classesDF[, 1: 117])
target = classes
target
reuters.task = makeMultilabelTask(data = tmp, target = target)
# We set a seed, because the classifier chain wrapper uses a random chain order. Next, we train a learner.
# I chose the classifier chain approach together with a decision tree for the binary classification problems.
binary.learner = makeLearner("classif.rpart")
lrncc = makeMultilabelClassifierChainsWrapper(binary.learner)
# Now let’s train and predict on our dataset:
n = getTaskSize(reuters.task)
train.set = seq(1, 7733, by = 1)
test.set = seq(7734, 10741, by = 1)
set.seed(1729)
reuters.mod.cc = train(lrncc, reuters.task, subset = train.set)
reuters.pred.cc = predict(reuters.mod.cc, task = reuters.task, subset = test.set)
# common multilabel performance measures
listMeasures("multilabel")
## [1] "multilabel.f1" "multilabel.subset01" "multilabel.tpr"
## [4] "multilabel.ppv" "multilabel.acc" "timeboth"
## [7] "timepredict" "multilabel.hamloss" "featperc"
## [10] "timetrain"
# classifier chains method performance
performance(reuters.pred.cc, measures = list(multilabel.hamloss, multilabel.subset01, multilabel.f1, multilabel.acc))
它在以下行失败: reuters.mod.cc = train(lrncc,reuters.task,子集= train.set)
任何见解将不胜感激!
谢谢你, 劳拉