Multilabel classification error with the mlr package

Time: 2018-07-23 21:15:35

Tags: multilabel-classification mlr

I am trying to do multilabel text classification following the guide provided here: https://mlr-org.github.io/Multilabel-Classification-with-mlr/

I am getting this error:

Error in checkLearnerBeforeTrain(task, learner, weights) :
  Task 'cottonseed.Class' is a one-class problem, but learner 'classif.rpart' does not support that!

where cottonseed.Class is one of my class labels. I have 117 class labels in total, so I am not sure why I am getting a "one-class problem" error.

My features/words (columns) and documents (rows) come from a document-term matrix. The class labels are the columns at the end of my data.frame, with a TRUE/FALSE value in each row (document).
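For reference, my understanding from the tutorial is that makeMultilabelTask expects a data.frame whose target columns are logical, roughly like this toy sketch (the feature and label names here are made up, not my real data):

library(mlr)

# toy data.frame: two numeric "word" features plus two logical label columns
toy = data.frame(
  word1 = c(0.1, 0.0, 0.3, 0.2),
  word2 = c(0.0, 0.4, 0.1, 0.0),
  earn.Class  = c(TRUE, FALSE, TRUE, FALSE),
  grain.Class = c(FALSE, TRUE, TRUE, FALSE)
)

# each label column contains both TRUE and FALSE, so this task builds without complaint
toy.task = makeMultilabelTask(id = "toy", data = toy, target = c("earn.Class", "grain.Class"))
toy.task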

Here is the code:

library(tm)
library(proxy)
library(RTextTools)
library(fpc)
library(wordcloud)
library(cluster)
library(stringi)
library(dplyr)
library(magrittr) 

#install.packages("tm.corpus.Reuters21578", repos = "http://datacube.wu.ac.at")
library(tm.corpus.Reuters21578)

data(Reuters21578)
reuters = Reuters21578

# remove all documents that do not have topic category for classification (remaining 11367)
reuters = tm_filter(reuters, FUN = function(x) !identical(meta(x)[["topics_cat"]] , character(0)))

# some documents appear to be empty -> remove all empty docs (remaining 11305)
reuters = tm_filter(reuters, FUN = function(x) !identical(meta(x)[["heading"]] , character(0)))

# get the trainset and the testset
reuters_lewissplit = tm_filter(reuters, FUN = function(x) meta(x)[["lewissplit"]] == "TRAIN" || meta(x)[["lewissplit"]] == "TEST")

# extract all topics/categories from the train and test sets
allTopics_lewissplit <- sapply(reuters_lewissplit, function(x){x$meta$topics_cat})
classes = unique(unlist(sapply(reuters_lewissplit, function(x){x$meta$topics_cat}), recursive = FALSE, use.names = FALSE))
classes[order(classes)]

# remove dashes because package mlr complains

library(stringr)
classes <- str_replace(classes, "-", ".")

# data frame with logical representation of classes
classesDF = data.frame(matrix(FALSE, ncol = length(classes)+1, nrow = length(allTopics_lewissplit)))
# I am adding the .Class to each class name because mlr complains if the class name is the same as a feature name
classes = paste0(classes, ".Class") 
colnames(classesDF) <- c(classes, c("TRAIN"))


for (i in 1:length(allTopics_lewissplit)) {
  topics =   unique(allTopics_lewissplit[[i]])
  topics <- str_replace(topics, "-", ".")
  topics = paste0(topics, ".Class")
  classesDF[i,topics] = TRUE
  if (meta(reuters_lewissplit[[i]])[["lewissplit"]] == "TRAIN") {
    classesDF[i,"TRAIN"] = TRUE
  }
}

# remove numbers
reuters_lewissplit <- tm_map(reuters_lewissplit, removeNumbers)

# eliminate extra white spaces
reuters_lewissplit <- tm_map(reuters_lewissplit, stripWhitespace)

# convert to lower case
reuters_lewissplit <- tm_map(reuters_lewissplit, content_transformer(tolower))

# remove stop words
reuters_lewissplit <- tm_map(reuters_lewissplit, removeWords, stopwords("english"))

# length(stopwords("english"))
# stopwords("english")

# remove punctuation
reuters_lewissplit <- tm_map(reuters_lewissplit, removePunctuation)


# create Document Term Matrix (DTM)
ndocs <- length(reuters_lewissplit)
# ignore extremely rare words i.e. terms that appear in less then 1% of the documents
minTermFreq <- ndocs * 0.01
# ignore overly common words i.e. terms that appear in more than 50% of the documents
maxTermFreq <- ndocs * .5


dtm = DocumentTermMatrix(reuters_lewissplit,
                         control = list(
                           wordLengths=c(4, 15),
                           bounds = list(global = c(minTermFreq, maxTermFreq)), 
                           weighting = weightTfIdf
                         ))


dtm.matrix = as.matrix(dtm)

####################################################################################################################
# Multilabel classification 
####################################################################################################################

library(mlr)

# join the dtm with the class labels
tmp = cbind(data.frame(dtm.matrix), classesDF[, 1:117])

target = classes
target

reuters.task = makeMultilabelTask(data = tmp, target = target)

# We set a seed, because the classifier chain wrapper uses a random chain order. Next, we train a learner. 
# I chose the classifier chain approach together with a decision tree for the binary classification problems.

binary.learner = makeLearner("classif.rpart")
lrncc = makeMultilabelClassifierChainsWrapper(binary.learner)


# Now let’s train and predict on our dataset:

n = getTaskSize(reuters.task)
train.set = seq(1, 7733, by = 1)
test.set = seq(7734, 10741, by = 1)

set.seed(1729)
reuters.mod.cc = train(lrncc, reuters.task, subset = train.set)
reuters.pred.cc = predict(reuters.mod.cc, task = reuters.task, subset = test.set)

# common multilabel performance measures
listMeasures("multilabel")

##  [1] "multilabel.f1"       "multilabel.subset01" "multilabel.tpr"
##  [4] "multilabel.ppv"      "multilabel.acc"      "timeboth"
##  [7] "timepredict"         "multilabel.hamloss"  "featperc"
## [10] "timetrain"

# classifier chains method performance

performance(reuters.pred.cc, measures = list(multilabel.hamloss, multilabel.subset01, multilabel.f1, multilabel.acc))

It fails on this line: reuters.mod.cc = train(lrncc, reuters.task, subset = train.set)
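My guess is that the error means at least one label column has only a single value (all FALSE or all TRUE) within the rows passed as subset, so the binary rpart learner for that label sees a one-class problem. A quick check along these lines (untested sketch, reusing the objects from the code above) should show which labels are affected:

# which label columns have fewer than two distinct values in the training rows?
train.labels = classesDF[train.set, 1:117]
single.class = names(which(sapply(train.labels, function(x) length(unique(x)) < 2)))
single.class   # does cottonseed.Class show up here?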

Any insights would be greatly appreciated!

Thank you,
Laura

0 answers:

There are no answers