我正在尝试学习 R。我有以下数据集,共 28 个变量,其中 5 个是标签(Class、Crown、Root、Trunk、Collar)。
首先,我尝试使用 randomForest 根据 Class 来预测树是否生病。
接着,我需要预测树的哪一部分生病(树冠 Crown、根 Root、根颈 Collar、树干 Trunk)。
为此,我需要使用多标签分类(randomForestSRC)或其他包。
如果有人可以举例说明如何使用多标签分类(或将标签类转换为二进制形式),我真的很感激。
提前谢谢!
dput(ML)
structure(list(Sector = c(5L, 3L, 3L, 2L, 1L, 3L, 6L, 2L, 2L,
5L, 3L, 4L, 5L, 1L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 3L, 5L, 4L, 2L,
4L, 4L, 2L, 1L, 2L, 5L, 6L, 3L, 3L, 6L, 2L, 3L, 3L, 6L, 3L, 5L,
6L, 3L, 4L, 5L, 1L, 3L, 5L, 3L, 2L, 3L, 6L, 5L), Plantation.year = c(2014L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2009L, 2004L, 2004L, 2005L, 2004L, 2004L, 2004L, 2004L, 2009L,
2004L, 2006L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2009L, 2004L, 2005L, 2004L, 2004L, 2004L, 2004L, 2004L, 2014L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L), Diagnosis.year = c(2014L,
2013L, NA, 2014L, 2013L, 2015L, 2013L, 2014L, 2013L, 2015L, 2015L,
2014L, 2013L, 2015L, 2015L, 2015L, 2013L, 2015L, 2013L, 2013L,
2015L, 2014L, 2013L, 2013L, 2014L, 2013L, 2014L, 2015L, 2014L,
2014L, 2013L, 2014L, 2014L, 2013L, 2015L, 2014L, 2014L, 2013L,
2014L, 2015L, 2015L, 2015L, 2015L, 2013L, 2015L, 2014L, 2014L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L), Next.diagnosis.year = c(2019L,
NA, 2014L, 2014L, NA, 2018L, 2020L, 2014L, NA, 2017L, NA, 2014L,
2016L, 2018L, 2020L, NA, NA, 2016L, NA, 2018L, 2018L, 2014L,
2016L, 2014L, 2014L, NA, 2015L, NA, 2014L, NA, NA, 2014L, NA,
NA, 2018L, 2017L, 2014L, NA, 2014L, 2020L, 2017L, 2017L, 2016L,
NA, 2018L, 2020L, 2019L, NA, NA, NA, NA, 2017L, 2018L), Stump.diameter = structure(c(2L,
6L, 2L, 7L, 5L, 7L, 6L, 6L, 7L, 7L, 2L, 7L, 5L, 2L, 5L, 2L, 5L,
5L, 1L, 6L, 4L, 1L, 6L, 6L, 2L, 7L, 3L, 5L, 2L, 7L, 6L, 6L, 2L,
10L, 2L, 7L, 6L, 2L, 5L, 2L, 5L, 2L, 6L, 5L, 5L, 2L, 6L, 1L,
8L, 2L, 9L, 8L, 11L), .Label = c("0 Ã 10 cm", "10 Ã 20 cm",
"100 Ã 110 cm", "110 Ã 120 cm", "20 Ã 30 cm", "30 Ã 40 cm",
"40 Ã 50 cm", "50 Ã 60 cm", "60 Ã 70 cm", "70 Ã 80 cm", "80 Ã 90 cm"
), class = "factor"), Species = structure(c(4L, 1L, 6L, 7L, 9L,
9L, 5L, 8L, 1L, NA, NA, 13L, 7L, 15L, NA, 12L, 11L, 7L, 9L, 1L,
8L, 15L, 8L, 13L, 11L, 9L, 1L, 8L, 4L, 14L, 8L, 1L, 9L, 7L, 9L,
2L, 8L, 9L, 8L, NA, 12L, 3L, 9L, 7L, 12L, 9L, 10L, 9L, 9L, 1L,
11L, 13L, 1L), .Label = c("acerifolia", "betulus", "campestris",
"cordata", "excelsior", "grandiflora", "japonica", "nigra", "Other ",
"platanoides", "pseudoplatanus", "styraciflua", "tomentosa",
"tulipifera", "verrucosa"), class = "factor"), Traffic.frequence = structure(c(2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 3L, 1L,
1L, 1L, 1L, 1L), .Label = c("passages fréquents et arrêts fréquents",
"passages fréquents ou arrêts", "quelques passages"), class = "factor"),
Botanical.category = structure(c(2L, 16L, 11L, 19L, 8L, 14L,
8L, 17L, 16L, 1L, 20L, 20L, 19L, 3L, 4L, 9L, 1L, 19L, 13L,
16L, 17L, 3L, 15L, 20L, 1L, 7L, 16L, 15L, 20L, 10L, 15L,
16L, 13L, 19L, 6L, 4L, 15L, 5L, 15L, 4L, 9L, 1L, 13L, 19L,
9L, 12L, 1L, 7L, 18L, 16L, 1L, 20L, 16L), .Label = c("Acer",
"Alnus", "Betula", "Carpinus", "Celtis", "Cercis", "Cupressus",
"Fraxinus", "Liquidambar", "Liriodendron", "Magnolia", "Malus",
"Other ", "Picea", "Pinus", "Platanus", "Populus", "Robinia",
"Sophora", "Tilia"), class = "factor"), PLU.ident.number = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), PLU.ProtectionCateg = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), Diagnosis.remarks = structure(c(4L,
4L, 4L, 4L, 4L, 2L, 4L, 1L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 4L,
4L, 2L, 4L, 4L, 3L, 4L, 3L, 4L, 3L, 4L, 4L, 4L, 3L, 4L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Arbre à abattre dans les 10 ans",
"Arbre à abattre dans les 5 ans", "Arbre d'avenir incertain",
"Arbre d'avenir normal"), class = "factor"), Diagnosis.renewal.priority = structure(c(3L,
3L, 3L, 3L, 3L, 1L, 3L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L,
2L, 1L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 3L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("de 1 Ã 5 ans",
"de 11 Ã 20 ans", "plus de 20 ans"), class = "factor"),
Reasoning.planting = structure(c(1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), .Label = "Remplacement", class = "factor"), Subcategory = structure(c(1L,
1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("ESP151", "ESP174"
), class = "factor"), Development.stage = structure(c(2L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("Arbre d'espaces ouverts",
"Arbre de voirie"), class = "factor"), STADEDEDEVELOPPEMENT = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Arbre adulte", "Arbre jeune",
"Arbre vieillissant"), class = "factor"), Development.stage.diagnosis = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("Arbre adulte", "Arbre jeune",
"Arbre vieillissant"), class = "factor"), Caterpillar.treat.priority = structure(c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, 2L, NA, NA, 1L,
NA, NA, NA, NA, NA, 1L, NA, 1L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), .Label = c("Haute", "Moyenne"
), class = "factor"), Recommended.treatment = structure(c(2L,
NA, 2L, 8L, NA, 3L, 6L, 6L, NA, 6L, NA, 9L, 5L, 7L, 7L, NA,
NA, 1L, NA, 7L, 7L, 4L, 6L, 9L, 6L, NA, 6L, NA, 2L, NA, NA,
7L, NA, NA, 7L, 6L, 7L, NA, 7L, 7L, 7L, 7L, 7L, NA, 7L, 2L,
6L, NA, NA, NA, NA, 6L, 5L), .Label = c("Abattage", "Controle",
"Controle résistographe", "Controle tuteur, attache ou protection",
"Taille d'éclaircie", "Taille de bois mort", "Taille formation et mise au gabarit",
"Taille mise en sécurité", "Taille rideau"), class = "factor"),
Sidewalk = structure(c(1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("non",
"oui"), class = "factor"), PLU.spatial.arrangement = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), Variety = structure(c(NA, NA,
NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, NA, NA, 4L, NA, NA,
NA, NA, NA, NA, NA, 1L, NA, NA, 2L, NA, 1L, NA, NA, 1L, NA,
NA, NA, NA, NA, 1L, NA, 1L, 4L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), .Label = c("Austriaca", "Glauca",
"Italica", "Pyramidalis"), class = "factor"), Vigor = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 1L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, NA,
2L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("vieillissement dépérissement",
"vigoureux", "vigueur intermédiaire"), class = "factor"),
Class = c(0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Collar = c(0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L), Crown = c(0L, 0L, 0L, 1L, 0L,
0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L), Root = c(0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
Trunk = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)), .Names = c("Sector",
"Plantation.year", "Diagnosis.year", "Next.diagnosis.year", "Stump.diameter",
"Species", "Traffic.frequence", "Botanical.category", "PLU.ident.number",
"PLU.ProtectionCateg", "Diagnosis.remarks", "Diagnosis.renewal.priority",
"Reasoning.planting", "Subcategory", "Development.stage", "STADEDEDEVELOPPEMENT",
"Development.stage.diagnosis", "Caterpillar.treat.priority",
"Recommended.treatment", "Sidewalk", "PLU.spatial.arrangement",
"Variety", "Vigor", "Class", "Collar", "Crown", "Root", "Trunk"
), class = "data.frame", row.names = c(NA, -53L))
答案 0 :(得分:0)
根据您的问题,您遇到的困难似乎是这些变量带有文字标签,而不是数值?解决这个问题最简单的方法,是在数据中为这 5 个标签变量各添加一个新变量,用相应的数字对其取值进行编码,例如 Acer = 1,Alnus = 2,依此类推(例如 class 对应的新变量为 class.id)。
现在,使用 randomForest 软件包时,必须将分类变量指定为因子(factor),否则它们会被错误地当作连续变量处理。要转换它们,只需添加如下代码:
# as.factor() returns a new object and does not modify its argument in place,
# so the result must be assigned back for the conversion to take effect.
class.id <- as.factor(class.id)
# Sanity check: TRUE now that the converted value has been stored.
is.factor(class.id)
然后,您就可以使用 randomForest 包中的随机森林函数了。我无法从您上面列出的数据中判断哪些变量决定一棵树/物种是否生病,但代码大致如下:
# Fit a random forest classifier for the (factor) response `class.id` and
# inspect the result.
#
# `variable1`, `variable2` and `matrix` are placeholders from the answer text:
# substitute the real predictor columns and the data frame that holds them.
library(randomForest)

sickness <- randomForest(
  # Predictors in an R formula are joined with `+`, not commas; append
  # further predictors the same way (`+ variable3 + ...`).
  class.id ~ variable1 + variable2,
  data = matrix,
  ntree = 4000,
  importance = TRUE, # needed for the full importance measures below
  proximity = TRUE
)

print(sickness)       # model summary
importance(sickness)  # variable importance table
varImpPlot(sickness)  # dot chart of variable importance