多标签分类R

时间:2017-06-03 20:50:01

标签: r

我正在尝试学习R。我有以下数据集,共28个变量,其中5个是标签(Class, Crown, Root, Trunk, Collar)。首先,我尝试使用randomForest根据Class标签来预测树是否生病。

接着,我需要预测树的哪一部分生病(树冠Crown,根Root,根颈Collar,树干Trunk)。

我需要使用多标签分类(randomForestSRC)或其他包。

如果有人可以举例说明如何使用多标签分类(或将标签类转换为二进制形式),我真的很感激。

提前谢谢!

dput(ML)
structure(list(Sector = c(5L, 3L, 3L, 2L, 1L, 3L, 6L, 2L, 2L, 
5L, 3L, 4L, 5L, 1L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 3L, 5L, 4L, 2L, 
4L, 4L, 2L, 1L, 2L, 5L, 6L, 3L, 3L, 6L, 2L, 3L, 3L, 6L, 3L, 5L, 
6L, 3L, 4L, 5L, 1L, 3L, 5L, 3L, 2L, 3L, 6L, 5L), Plantation.year = c(2014L, 
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 
2009L, 2004L, 2004L, 2005L, 2004L, 2004L, 2004L, 2004L, 2009L, 
2004L, 2006L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 
2009L, 2004L, 2005L, 2004L, 2004L, 2004L, 2004L, 2004L, 2014L, 
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L), Diagnosis.year = c(2014L, 
2013L, NA, 2014L, 2013L, 2015L, 2013L, 2014L, 2013L, 2015L, 2015L, 
2014L, 2013L, 2015L, 2015L, 2015L, 2013L, 2015L, 2013L, 2013L, 
2015L, 2014L, 2013L, 2013L, 2014L, 2013L, 2014L, 2015L, 2014L, 
2014L, 2013L, 2014L, 2014L, 2013L, 2015L, 2014L, 2014L, 2013L, 
2014L, 2015L, 2015L, 2015L, 2015L, 2013L, 2015L, 2014L, 2014L, 
2013L, 2013L, 2013L, 2013L, 2013L, 2013L), Next.diagnosis.year = c(2019L, 
NA, 2014L, 2014L, NA, 2018L, 2020L, 2014L, NA, 2017L, NA, 2014L, 
2016L, 2018L, 2020L, NA, NA, 2016L, NA, 2018L, 2018L, 2014L, 
2016L, 2014L, 2014L, NA, 2015L, NA, 2014L, NA, NA, 2014L, NA, 
NA, 2018L, 2017L, 2014L, NA, 2014L, 2020L, 2017L, 2017L, 2016L, 
NA, 2018L, 2020L, 2019L, NA, NA, NA, NA, 2017L, 2018L), Stump.diameter = structure(c(2L, 
6L, 2L, 7L, 5L, 7L, 6L, 6L, 7L, 7L, 2L, 7L, 5L, 2L, 5L, 2L, 5L, 
5L, 1L, 6L, 4L, 1L, 6L, 6L, 2L, 7L, 3L, 5L, 2L, 7L, 6L, 6L, 2L, 
10L, 2L, 7L, 6L, 2L, 5L, 2L, 5L, 2L, 6L, 5L, 5L, 2L, 6L, 1L, 
8L, 2L, 9L, 8L, 11L), .Label = c("0 Ã  10 cm", "10 Ã  20 cm", 
"100 Ã  110 cm", "110 Ã  120 cm", "20 Ã  30 cm", "30 Ã  40 cm", 
"40 Ã  50 cm", "50 Ã  60 cm", "60 Ã  70 cm", "70 Ã  80 cm", "80 Ã  90 cm"
), class = "factor"), Species = structure(c(4L, 1L, 6L, 7L, 9L, 
9L, 5L, 8L, 1L, NA, NA, 13L, 7L, 15L, NA, 12L, 11L, 7L, 9L, 1L, 
8L, 15L, 8L, 13L, 11L, 9L, 1L, 8L, 4L, 14L, 8L, 1L, 9L, 7L, 9L, 
2L, 8L, 9L, 8L, NA, 12L, 3L, 9L, 7L, 12L, 9L, 10L, 9L, 9L, 1L, 
11L, 13L, 1L), .Label = c("acerifolia", "betulus", "campestris", 
"cordata", "excelsior", "grandiflora", "japonica", "nigra", "Other ", 
"platanoides", "pseudoplatanus", "styraciflua", "tomentosa", 
"tulipifera", "verrucosa"), class = "factor"), Traffic.frequence = structure(c(2L, 
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 3L, 1L, 
1L, 1L, 1L, 1L), .Label = c("passages fréquents et arrêts fréquents", 
"passages fréquents ou arrêts", "quelques passages"), class = "factor"), 
    Botanical.category = structure(c(2L, 16L, 11L, 19L, 8L, 14L, 
    8L, 17L, 16L, 1L, 20L, 20L, 19L, 3L, 4L, 9L, 1L, 19L, 13L, 
    16L, 17L, 3L, 15L, 20L, 1L, 7L, 16L, 15L, 20L, 10L, 15L, 
    16L, 13L, 19L, 6L, 4L, 15L, 5L, 15L, 4L, 9L, 1L, 13L, 19L, 
    9L, 12L, 1L, 7L, 18L, 16L, 1L, 20L, 16L), .Label = c("Acer", 
    "Alnus", "Betula", "Carpinus", "Celtis", "Cercis", "Cupressus", 
    "Fraxinus", "Liquidambar", "Liriodendron", "Magnolia", "Malus", 
    "Other ", "Picea", "Pinus", "Platanus", "Populus", "Robinia", 
    "Sophora", "Tilia"), class = "factor"), PLU.ident.number = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA), PLU.ProtectionCateg = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA), Diagnosis.remarks = structure(c(4L, 
    4L, 4L, 4L, 4L, 2L, 4L, 1L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 4L, 
    4L, 2L, 4L, 4L, 3L, 4L, 3L, 4L, 3L, 4L, 4L, 4L, 3L, 4L, 3L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Arbre à abattre dans les 10 ans", 
    "Arbre à abattre dans les 5 ans", "Arbre d'avenir incertain", 
    "Arbre d'avenir normal"), class = "factor"), Diagnosis.renewal.priority = structure(c(3L, 
    3L, 3L, 3L, 3L, 1L, 3L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 
    2L, 1L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 3L, 2L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("de 1 Ã  5 ans", 
    "de 11 Ã  20 ans", "plus de 20 ans"), class = "factor"), 
    Reasoning.planting = structure(c(1L, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA), .Label = "Remplacement", class = "factor"), Subcategory = structure(c(1L, 
    1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
    1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 
    1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 
    2L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("ESP151", "ESP174"
    ), class = "factor"), Development.stage = structure(c(2L, 
    2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
    2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 
    2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 
    1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("Arbre d'espaces ouverts", 
    "Arbre de voirie"), class = "factor"), STADEDEDEVELOPPEMENT = structure(c(2L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Arbre adulte", "Arbre jeune", 
    "Arbre vieillissant"), class = "factor"), Development.stage.diagnosis = structure(c(2L, 
    1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 
    1L, 1L, 2L, 1L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    1L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("Arbre adulte", "Arbre jeune", 
    "Arbre vieillissant"), class = "factor"), Caterpillar.treat.priority = structure(c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, 2L, NA, NA, 1L, 
    NA, NA, NA, NA, NA, 1L, NA, 1L, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA), .Label = c("Haute", "Moyenne"
    ), class = "factor"), Recommended.treatment = structure(c(2L, 
    NA, 2L, 8L, NA, 3L, 6L, 6L, NA, 6L, NA, 9L, 5L, 7L, 7L, NA, 
    NA, 1L, NA, 7L, 7L, 4L, 6L, 9L, 6L, NA, 6L, NA, 2L, NA, NA, 
    7L, NA, NA, 7L, 6L, 7L, NA, 7L, 7L, 7L, 7L, 7L, NA, 7L, 2L, 
    6L, NA, NA, NA, NA, 6L, 5L), .Label = c("Abattage", "Controle", 
    "Controle résistographe", "Controle tuteur, attache ou protection", 
    "Taille d'éclaircie", "Taille de bois mort", "Taille formation et mise au gabarit", 
    "Taille mise en sécurité", "Taille rideau"), class = "factor"), 
    Sidewalk = structure(c(1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 
    1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 
    1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("non", 
    "oui"), class = "factor"), PLU.spatial.arrangement = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA), Variety = structure(c(NA, NA, 
    NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, NA, NA, 4L, NA, NA, 
    NA, NA, NA, NA, NA, 1L, NA, NA, 2L, NA, 1L, NA, NA, 1L, NA, 
    NA, NA, NA, NA, 1L, NA, 1L, 4L, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA), .Label = c("Austriaca", "Glauca", 
    "Italica", "Pyramidalis"), class = "factor"), Vigor = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    3L, 1L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, NA, 
    2L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("vieillissement dépérissement", 
    "vigoureux", "vigueur intermédiaire"), class = "factor"), 
    Class = c(0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
    0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 
    0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Collar = c(0L, 
    0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 1L, 0L, 0L), Crown = c(0L, 0L, 0L, 1L, 0L, 
    0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
    1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 0L, 0L), Root = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), 
    Trunk = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)), .Names = c("Sector", 
"Plantation.year", "Diagnosis.year", "Next.diagnosis.year", "Stump.diameter", 
"Species", "Traffic.frequence", "Botanical.category", "PLU.ident.number", 
"PLU.ProtectionCateg", "Diagnosis.remarks", "Diagnosis.renewal.priority", 
"Reasoning.planting", "Subcategory", "Development.stage", "STADEDEDEVELOPPEMENT", 
"Development.stage.diagnosis", "Caterpillar.treat.priority", 
"Recommended.treatment", "Sidewalk", "PLU.spatial.arrangement", 
"Variety", "Vigor", "Class", "Collar", "Crown", "Root", "Trunk"
), class = "data.frame", row.names = c(NA, -53L))

1 个答案:

答案 0 :(得分:0)

根据您的上述问题,您遇到的困难似乎是:这些变量的取值是文本标签而不是数字?解决这个问题最简单的方法是,在数据矩阵中为这5个标签变量各新增一个变量,用相应的数字对其取值进行编码。例如Acer = 1,Alnus = 2,依此类推(class相应地变为class.id)。

现在,使用randomForest软件包时,必须将分类预测变量指定为因子,否则它们将被错误地视为连续变量。所以要转换它们,只需添加以下代码:

# Convert to factor. as.factor() returns a new vector and does not modify its
# argument, so the result has to be assigned back for the conversion to stick.
class.id <- as.factor(class.id)

# Check: prints TRUE once class.id is a factor.
is.factor(class.id)

然后,您可以使用randomForest包中的随机森林函数。我无法从您上面列出的变量矩阵中判断哪些变量决定一棵树/物种是否生病,但代码大致如下:

# Predict sickness based on your variables.
# `variable1`, `variable2` and `matrix` are placeholders: replace them with the
# real predictor columns (joined with `+` in the formula) and with the data
# frame that holds them. class.id should be a factor so that randomForest()
# fits a classification forest instead of a regression forest.
# NOTE: randomForest() stops on missing values by default (na.action = na.fail);
# if the data contains NA, drop those rows/columns or pass
# na.action = na.roughfix.
library(randomForest)

sickness <- randomForest(
  class.id ~ variable1 + variable2,
  data = matrix,
  ntree = 4000,
  importance = TRUE,
  proximity = TRUE
)

# View the fitted model and the variable importance values, then plot them.
print(sickness)
importance(sickness)
varImpPlot(sickness)