假设一个包含多个行和列的数据集,其中一些列为0(我的意思是列中的所有值都是0)。如何筛选出这些列?我尝试使用以下代码但无济于事。
training_data <- Filer(function(x) { !(all(x[, 1:99]==0))}, training_data)
更新:
对不起。在数据集中,并非所有列都是数字,因此我需要为列指定1:99的范围。
更新V2:
添加了我的数据集的一部分(使用dput生成):
structure(list(label = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("A",
"B"), class = "factor"), f1 = c(15, 24, 10, 9, 6, 9), f2 = c(6,
14, 5, 4, 2, 4), f3 = c(6, 7, 2, 2, 1, 2), f4 = c(0, 0, 0, 0,
0, 0), f5 = c(9, 15, 6, 5, 3, 5), f6 = c(3, 7, 2, 2, 1, 2), f7 = c(1,
0, 0, 0, 0, 0), f8 = c(4, 11, 5, 4, 2, 4), f9 = c(5, 3, 0, 0,
0, 0), f10 = c(1, 3, 0, 0, 0, 0), f11 = c(1, 4, 2, 2, 1, 2),
f12 = c(0, 0, 0, 0, 0, 0), f13 = c(13, 15, 7, 6, 3, 6), f14 = c(0,
7, 1, 1, 1, 1), f15 = c(0, 0, 0, 0, 0, 0), f16 = c(20, 30,
11, 10, 6, 10), f17 = c(5, 0, 0, 0, 0, 0), f18 = c(0, 0,
0, 0, 0, 0), ft19 = c(28, 344, 399, 28, 82, 42), f20 = c(2.15,
15.64, 49.88, 4, 20.5, 6), f21 = c(0, 0, 0, 0, 0, 0), f22 = c(0,
0, 0, 0, 0, 0), f23 = c(6, 7, 2, 2, 1, 2), f24 = c(0, 0,
0, 0, 0, 0), f25 = c(19, 334, 395, 23, 79, 37), f26 = c(0,
26, 37, 6, 16, 7), f27 = c(11, 64, 101, 5, 17, 12), f28 = c(0,
0, 0, 0, 0, 0), f29 = c(2, 37, 101, 7, 26, 8), f30 = c(0,
18, 32, 2, 16, 4), f31 = c(0, 0, 0, 0, 0, 0), f32 = c(0,
0, 0, 0, 0, 0), f33 = c(3, 0, 1, 0, 1, 0), f34 = c(5, 44,
32, 4, 15, 5), f35 = c(0, 0, 0, 0, 0, 0), f36 = c(0, 0, 0,
0, 0, 0), f37 = c(0, 0, 0, 0, 0, 0), f38 = c(0, 0, 0, 0,
0, 0), f39 = c(6, 8, 10, 3, 2, 3), f40 = c(4, 6, 16, 4, 4,
3), f41 = c(18, 36, 37, 7, 5, 7), f42 = c(0, 18, 27, 0, 14,
1), f43 = c(0, 0, 0, 0, 0, 0), f44 = c(54, 743, 910, 65,
184, 100), f45 = c(14, 133, 91, 25, 18, 40), f46 = c(0, 0,
0, 0, 0, 0), f47 = c(4, 25, 17, 6, 6, 8), f48 = c(0, 0, 0,
0, 0, 0), f49 = c(0.46, 1, 1.5, 1.14, 1.5, 1.14), f50 = c(2.67,
1.86, 1.83, 1.88, 1.67, 1.88), f51 = c(3, 9, 1, 2, 1, 2),
f52 = c(0, 1, 2, 1, 1, 1), f53 = c(10, 12, 5, 4, 2, 4), f54 = c(0,
0, 0, 0, 0, 0), ft55 = c(3, 10, 3, 3, 2, 3), f56 = c(0.54,
0.07, 0.03, 0.32, 0.07, 0.21), f57 = c(0.21, 0.04, 0.01,
0.14, 0.02, 0.1), f58 = c(0.21, 0.02, 0.01, 0.07, 0.01, 0.05
), f59 = c(0, 0, 0, 0, 0, 0), f60 = c(0.32, 0.04, 0.02, 0.18,
0.04, 0.12), f61 = c(0.11, 0.02, 0.01, 0.07, 0.01, 0.05),
f62 = c(0.04, 0, 0, 0, 0, 0), f63 = c(0.14, 0.03, 0.01, 0.14,
0.02, 0.1), f64 = c(0.18, 0.01, 0, 0, 0, 0), f65 = c(0.04,
0.01, 0, 0, 0, 0), f66 = c(0.04, 0.01, 0.01, 0.07, 0.01,
0.05), f67 = c(0, 0, 0, 0, 0, 0), f68 = c(0.46, 0.04, 0.02,
0.21, 0.04, 0.14), f69 = c(0, 0.02, 0, 0.04, 0.01, 0.02),
f70 = c(0, 0, 0, 0, 0, 0), f71 = c(0.71, 0.09, 0.03, 0.36,
0.07, 0.24), f72 = c(0.18, 0, 0, 0, 0, 0), f73 = c(0, 0,
0, 0, 0, 0), f74 = c(1, 1, 1, 1, 1, 1), f75 = c(0.08, 0.05,
0.12, 0.14, 0.25, 0.14), f76 = c(0, 0, 0, 0, 0, 0), f77 = c(0,
0, 0, 0, 0, 0), f78 = c(0.21, 0.02, 0.01, 0.07, 0.01, 0.05
), f79 = c(0, 0, 0, 0, 0, 0), f80 = c(0.68, 0.97, 0.99, 0.82,
0.96, 0.88), f81 = c(0, 0.08, 0.09, 0.21, 0.2, 0.17), f82 = c(0.39,
0.19, 0.25, 0.18, 0.21, 0.29), f83 = c(0, 0, 0, 0, 0, 0),
f84 = c(0.07, 0.11, 0.25, 0.25, 0.32, 0.19), f85 = c(0, 0.05,
0.08, 0.07, 0.2, 0.1), f86 = c(0, 0, 0, 0, 0, 0), f87 = c(0,
0, 0, 0, 0, 0), f88 = c(0.11, 0, 0, 0, 0.01, 0), f89 = c(0.18,
0.13, 0.08, 0.14, 0.18, 0.12), f90 = c(0, 0, 0, 0, 0, 0),
f91 = c(0, 0, 0, 0, 0, 0), f92 = c(0, 0, 0, 0, 0, 0), f93 = c(0,
0, 0, 0, 0, 0), f94 = c(0.21, 0.02, 0.03, 0.11, 0.02, 0.07
), f95 = c(0.14, 0.02, 0.04, 0.14, 0.05, 0.07), f96 = c(0.64,
0.1, 0.09, 0.25, 0.06, 0.17), f97 = c(0, 0.05, 0.07, 0, 0.17,
0.02), f98 = c(0, 0, 0, 0, 0, 0), f99 = c(1.93, 2.16, 2.28,
2.32, 2.24, 2.38), f100 = c(0.5, 0.39, 0.23, 0.89, 0.22,
0.95), f101 = c(0, 0, 0, 0, 0, 0), f102 = c(0.14, 0.07, 0.04,
0.21, 0.07, 0.19), f103 = c(0, 0, 0, 0, 0, 0), f104 = c(0.02,
0, 0, 0.04, 0.02, 0.03), f105 = c(0.1, 0.01, 0, 0.07, 0.02,
0.04), f106 = c(0.11, 0.03, 0, 0.07, 0.01, 0.05), f107 = c(0,
0, 0.01, 0.04, 0.01, 0.02), f108 = c(0.36, 0.03, 0.01, 0.14,
0.02, 0.1), f109 = c(0, 0, 0, 0, 0, 0), f110 = c(0.11, 0.03,
0.01, 0.11, 0.02, 0.07)), .Names = c("label", "f1", "f2",
"f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12",
"f13", "f14", "f15", "f16", "f17", "f18", "ft19", "f20", "f21",
"f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29", "f30",
"f31", "f32", "f33", "f34", "f35", "f36", "f37", "f38", "f39",
"f40", "f41", "f42", "f43", "f44", "f45", "f46", "f47", "f48",
"f49", "f50", "f51", "f52", "f53", "f54", "ft55", "f56", "f57",
"f58", "f59", "f60", "f61", "f62", "f63", "f64", "f65", "f66",
"f67", "f68", "f69", "f70", "f71", "f72", "f73", "f74", "f75",
"f76", "f77", "f78", "f79", "f80", "f81", "f82", "f83", "f84",
"f85", "f86", "f87", "f88", "f89", "f90", "f91", "f92", "f93",
"f94", "f95", "f96", "f97", "f98", "f99", "f100", "f101", "f102",
"f103", "f104", "f105", "f106", "f107", "f108", "f109", "f110"
), class = "data.frame", row.names = c(NA, -6L))
答案 0 :(得分:3)
training_data[, !colSums(training_data == 0)]
基于问题更新:(过滤器应用于第1 - 99列)
idx <- which(as.logical(colSums(training_data[, 1:99] == 0))) # find columns
training_data[, setdiff(seq_along(test_data), idx)] # exclude columns
答案 1 :(得分:2)
您可以使用colSums
dat <- diag(10)
dat[1,1] <- 0
dat[5,5] <- 0
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 0 0 0 0 0 0 0 0 0 0
[2,] 0 1 0 0 0 0 0 0 0 0
[3,] 0 0 1 0 0 0 0 0 0 0
[4,] 0 0 0 1 0 0 0 0 0 0
[5,] 0 0 0 0 0 0 0 0 0 0
[6,] 0 0 0 0 0 1 0 0 0 0
[7,] 0 0 0 0 0 0 1 0 0 0
[8,] 0 0 0 0 0 0 0 1 0 0
[9,] 0 0 0 0 0 0 0 0 1 0
[10,] 0 0 0 0 0 0 0 0 0 1
colSums(dat) == 0
TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
因此,要删除0列,只需执行此操作
dat[ ,colSums(dat)!=0]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] 0 0 0 0 0 0 0 0
[2,] 1 0 0 0 0 0 0 0
[3,] 0 1 0 0 0 0 0 0
[4,] 0 0 1 0 0 0 0 0
[5,] 0 0 0 0 0 0 0 0
[6,] 0 0 0 1 0 0 0 0
[7,] 0 0 0 0 1 0 0 0
[8,] 0 0 0 0 0 1 0 0
[9,] 0 0 0 0 0 0 1 0
[10,] 0 0 0 0 0 0 0 1
编辑
这假设所有数据具有相同的符号;为了避免依赖这一假设,可以对绝对值求和:
dat[ ,colSums(abs(dat[,1:99]))!=0]
答案 2 :(得分:2)
training_data[,apply(training_data, MARGIN = 2, FUN = function(x) !all(x == 0))]
答案 3 :(得分:2)
另一种适用于data.frame的方式是使用lapply。我认为apply会在内部将data.frame转换为matrix。
df[!unlist(lapply(df, function(x) all(x==0)))]
或者在你的情况下:
df[, 1:99][!unlist(lapply(df[, 1:99], function(x) all(x==0)))]
修改:使用colSums的另一种方式。诀窍是先检查各元素是否为0,然后再对比较结果使用colSums。
df[!colSums(df == 0) == nrow(df)]
如果您知道哪些列是数字的(例如,1:99),请将df替换为df[,1:99]:
df[,1:99][!colSums(df[,1:99] == 0) == nrow(df)]
答案 4 :(得分:2)
我认为在那些使用all(x == 0)的解决方案中,改用any(x!=0)会稍微高效一些,因为any在遇到第一个!=0的元素后就会停止,随着行数的增加,这一点会变得很重要。
下面使用plyr和colwise(以dat作为dput数据)提供另一种解决方案:
library(plyr)
f0 <- function(x) any(x!=0) & is.numeric(x)
colwise(identity, f0)(dat)
想法是遍历dat中的每一列并返回它(identity),但仅当f0返回TRUE时才返回,即该列至少有一个条目!=0并且该列is.numeric。
修改:
为列表中的每个data.frame执行此操作,例如:
training_data <- list(dat, dat, dat, dat)
EDIT2:保留标签列:
training_data_clean <- lapply(training_data, function(z) colwise(identity, f0)(z))
sapply(training_data, dim)
[,1] [,2] [,3] [,4]
[1,] 6 6 6 6
[2,] 111 111 111 111
sapply(training_data_clean, dim)
[,1] [,2] [,3] [,4]
[1,] 6 6 6 6
[2,] 74 74 74 74
答案 5 :(得分:1)
apply(df, 2, Filter, f = function(x){!all(x==0)})
我有同样的question。