我正在尝试对我的 OutVals(异常值)使用 $names 来查找这些异常值所属的类,然后将异常值及其类名放入一个数据框中,以便清楚地看到每个异常值来自哪个类。
然而,在尝试实现这一点时,返回的类名是“1”、“2”等,而不是数据集中的“van”、“bus”等。
我是不是错过了什么,还是我的方法完全错误? 目标是获取数据中的异常值并将它们放在一个表中,该表显示异常值来自哪个类
任何帮助将不胜感激
我在下面展示了我的数据框以及我的可复制代码
library(reshape2)
# Reproducible sample of the vehicle data: six rows, a sample number,
# eighteen integer shape features and the vehicle class label.
vehData <- structure(
  list(
    Samples      = 1:6,
    Comp         = c(95L, 91L, 104L, 93L, 85L, 107L),
    Circ         = c(48L, 41L, 50L, 41L, 44L, 57L),
    D.Circ       = c(83L, 84L, 106L, 82L, 70L, 106L),
    Rad.Ra       = c(178L, 141L, 209L, 159L, 205L, 172L),
    Pr.Axis.Ra   = c(72L, 57L, 66L, 63L, 103L, 50L),
    Max.L.Ra     = c(10L, 9L, 10L, 9L, 52L, 6L),
    Scat.Ra      = c(162L, 149L, 207L, 144L, 149L, 255L),
    Elong        = c(42L, 45L, 32L, 46L, 45L, 26L),
    Pr.Axis.Rect = c(20L, 19L, 23L, 19L, 19L, 28L),
    Max.L.Rect   = c(159L, 143L, 158L, 143L, 144L, 169L),
    Sc.Var.Maxis = c(176L, 170L, 223L, 160L, 241L, 280L),
    Sc.Var.maxis = c(379L, 330L, 635L, 309L, 325L, 957L),
    Ra.Gyr       = c(184L, 158L, 220L, 127L, 188L, 264L),
    Skew.Maxis   = c(70L, 72L, 73L, 63L, 127L, 85L),
    Skew.maxis   = c(6L, 9L, 14L, 6L, 9L, 5L),
    Kurt.maxis   = c(16L, 14L, 9L, 10L, 11L, 9L),
    Kurt.Maxis   = c(187L, 189L, 188L, 199L, 180L, 181L),
    Holl.Ra      = c(197L, 199L, 196L, 207L, 183L, 183L),
    Class        = c("van", "van", "saab", "van", "bus", "bus")
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)
# Remove outliers function
#
# Draws a boxplot of `data` (one box per column), prints a table of the
# outlying values together with the name of the column each one came from,
# and strips those values from every column.
#
# data: a data frame (or list) of numeric columns.
# Returns the result of sapply() over the columns with the outlying values
# removed; this is a list when the columns end up with different lengths.
removeOutliers <- function(data) {
  bp <- boxplot(data)
  OutVals <- bp$out
  # bp$group gives, for every outlier, the index of the box it belongs to and
  # bp$names gives the label of each box. Re-plotting the outliers on their
  # own only yields the label "1", which is why the names were lost before.
  namesforgroups <- bp$names[bp$group]
  dataf <- data.frame(OutVals = OutVals, group = namesforgroups) # outlier + names
  print(dataf) # show all outliers with their group
  # A value is dropped from every column in which it occurs, not only from
  # the column where it was flagged (kept as in the original behaviour).
  remOutliers <- sapply(data, function(x) x[!x %in% OutVals])
  return(remOutliers)
}
#Store the class labels first: once the Class column has been dropped below,
#vehData$Class would only return NULL
vehClass <- vehData$Class #store original class names
#Remove class column and sample number
vehDataRemove1 <- vehData[, -1] #drop the first column (Samples)
vehDataRemove2 <- vehDataRemove1[, -19] #drop what is now column 19 (Class)
vehData <- vehDataRemove2 #assign to new variable
#Begin removing outliers
removeOutliers1 <- removeOutliers(vehData) #remove first set of outliers
removeOutliers2 <- removeOutliers(removeOutliers1) #test again for more and remove
答案 0 :(得分:1)
箱线图对象本身并不提供异常值对应哪一行/哪个类名的信息,你必须自己去查。它提供的是 boxplot(data)$group,即每个异常值所在的列,因此您可以使用 which 找出该值位于哪一行,再据此得到它所属的类。我重写了你的函数,它现在会打印一个表,其中包含异常值、它所在的列以及它所在行对应的类。第一次迭代中有 5 个异常值,分布在 3 行中;第二次迭代中没有异常值——这是合理的,因为它们已被删除。
# Report and remove the rows of `data` that contain boxplot outliers.
#
# data:  data frame of numeric columns (one box is drawn per column).
# class: vector of class labels, one per row of `data`.
#
# Prints a table with one line per outlying cell: the value (OutVals), the
# index of the column it sits in (columns) and the class label of its row
# (classes). Returns list(data, class) with every row that holds at least
# one outlier removed; both are returned untouched when there are no outliers.
removeOutliers <- function(data, class) {
  x <- boxplot(data)
  OutVals <- x$out
  columns <- x$group # index of the column each outlier belongs to

  if (length(columns) == 0) {
    return(list(data, class))
  }

  # Rows in which each outlying value occurs within its own column.
  hits <- lapply(seq_along(columns), function(i) {
    which(data[, columns[i]] == OutVals[i])
  })
  n_hits <- lengths(hits)

  # A value that occurs several times in a column is reported by boxplot()
  # once per occurrence and each report matches all of those rows, so the
  # (value, column, row) triples are de-duplicated to one line per cell.
  located <- unique(data.frame(
    OutVals = rep(OutVals, n_hits),
    columns = rep(columns, n_hits),
    rows = unlist(hits)
  ))

  dt <- data.frame(
    OutVals = located$OutVals,
    columns = located$columns,
    classes = class[located$rows]
  )
  print(dt) # show all outliers

  ind <- unique(located$rows)
  # drop = FALSE keeps a data frame even when `data` has a single column.
  list(data[-ind, , drop = FALSE], class[-ind])
}
# Set the class labels aside before working on the feature columns
vehClass <- vehData$Class
# Drop column 1 (sample number) and column 20 (class)
vehData1 <- vehData[, -c(1, 20)]
# First pass: report and remove the first set of outliers
removeOutliers1 <- removeOutliers(data = vehData1, class = vehClass)
OutVals columns classes
1 103 5 bus
2 52 6 bus
3 6 6 bus
4 127 14 bus
5 14 15 saab
# Second pass: feed the data and class labels returned by the first pass
# back into removeOutliers to check for further outliers
removeOutliers2 <- removeOutliers(removeOutliers1[[1]], removeOutliers1[[2]])
下面给出两个函数:第一个函数(removeOutliers)返回一个删除了含异常值的行的数据框;第二个函数(getOutliers)返回一个表,其中包含每个异常值的信息(类、列和值)。
# Drop every row of `data` that holds a boxplot outlier in any column.
#
# data: data frame; a column named "Class" (if present) is treated as the
#       label column and excluded from the outlier search.
# Returns `data` without the offending rows, or `data` unchanged when no
# outliers are found.
removeOutliers <- function(data) {
  # Search only the non-label columns and index into this same subset, so the
  # group indices reported by boxplot() stay aligned wherever Class sits.
  features <- data[, setdiff(names(data), "Class"), drop = FALSE]
  x <- boxplot(features, plot = FALSE)

  # Without this guard data[-integer(0), ] would select zero rows.
  if (length(x$out) == 0) {
    return(data)
  }

  hits <- lapply(seq_along(x$out), function(i) {
    which(features[[x$group[i]]] == x$out[i])
  })
  outlierRows <- unique(unlist(hits))
  data[-outlierRows, , drop = FALSE]
}
# List every boxplot outlier in `data` with its class and column name.
#
# data: data frame with a "Class" label column; all other columns are
#       searched for outliers. A boxplot of those columns is drawn.
# Returns a data frame with columns `class`, `value` and `column`, one row
# per outlying cell (zero rows when there are no outliers).
getOutliers <- function(data) {
  # Index into the same column subset that boxplot() saw, so group indices
  # map to the right column names wherever Class sits.
  features <- data[, setdiff(names(data), "Class"), drop = FALSE]
  x <- boxplot(features)

  # Rows in which each outlying value occurs within its own column.
  hits <- lapply(seq_along(x$out), function(i) {
    which(features[[x$group[i]]] == x$out[i])
  })
  n_hits <- lengths(hits)

  # A value repeated within a column is reported once per occurrence and
  # matches all of those rows each time; keep one entry per cell.
  located <- unique(data.frame(
    row = as.integer(unlist(hits)),
    value = rep(x$out, n_hits),
    column = rep(names(features)[x$group], n_hits)
  ))

  outlierInfo <- data.frame(
    class = data$Class[located$row],
    value = located$value,
    column = located$column
  )
  outlierInfo
}
# Apply removeOutliers to the full data frame; the printed result follows
removeOutliers(vehData)
Samples Comp Circ D.Circ Rad.Ra Pr.Axis.Ra Max.L.Ra Scat.Ra Elong Pr.Axis.Rect Max.L.Rect
1 1 95 48 83 178 72 10 162 42 20 159
2 2 91 41 84 141 57 9 149 45 19 143
4 4 93 41 82 159 63 9 144 46 19 143
Sc.Var.Maxis Sc.Var.maxis Ra.Gyr Skew.Maxis Skew.maxis Kurt.maxis Kurt.Maxis Holl.Ra Class
1 176 379 184 70 6 16 187 197 van
2 170 330 158 72 9 14 189 199 van
4 160 309 127 63 6 10 199 207 van
# Apply getOutliers to the full data frame; the printed result follows
getOutliers(vehData)
class value column
1 bus 103 Pr.Axis.Ra
2 bus 52 Max.L.Ra
3 bus 6 Max.L.Ra
4 bus 127 Skew.Maxis
5 saab 14 Skew.maxis