过滤掉R中的多个列

时间:2013-03-05 12:00:49

标签: r dataframe

假设一个包含多个行和列的数据集,其中一些列为0(我的意思是列中的所有值都是0)。如何筛选出这些列?我尝试使用以下代码但无济于事。

# NOTE(review): the asker's non-working attempt, kept exactly as posted.
# `Filer` is presumably a typo for `Filter`. Filter() would hand each column
# to the function as a plain vector, so the two-dimensional index
# `x[, 1:99]` cannot work inside it either.
training_data <- Filer(function(x) { !(all(x[, 1:99]==0))}, training_data)

更新:

对不起。在数据集中,并非所有列都是数字,因此我需要为列指定1:99的范围。

更新V2:添加了我的数据集的一部分(使用dput):

structure(list(label = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("A", 
"B"), class = "factor"), f1 = c(15, 24, 10, 9, 6, 9), f2 = c(6, 
14, 5, 4, 2, 4), f3 = c(6, 7, 2, 2, 1, 2), f4 = c(0, 0, 0, 0, 
0, 0), f5 = c(9, 15, 6, 5, 3, 5), f6 = c(3, 7, 2, 2, 1, 2), f7 = c(1, 
0, 0, 0, 0, 0), f8 = c(4, 11, 5, 4, 2, 4), f9 = c(5, 3, 0, 0, 
0, 0), f10 = c(1, 3, 0, 0, 0, 0), f11 = c(1, 4, 2, 2, 1, 2), 
    f12 = c(0, 0, 0, 0, 0, 0), f13 = c(13, 15, 7, 6, 3, 6), f14 = c(0, 
    7, 1, 1, 1, 1), f15 = c(0, 0, 0, 0, 0, 0), f16 = c(20, 30, 
    11, 10, 6, 10), f17 = c(5, 0, 0, 0, 0, 0), f18 = c(0, 0, 
    0, 0, 0, 0), ft19 = c(28, 344, 399, 28, 82, 42), f20 = c(2.15, 
    15.64, 49.88, 4, 20.5, 6), f21 = c(0, 0, 0, 0, 0, 0), f22 = c(0, 
    0, 0, 0, 0, 0), f23 = c(6, 7, 2, 2, 1, 2), f24 = c(0, 0, 
    0, 0, 0, 0), f25 = c(19, 334, 395, 23, 79, 37), f26 = c(0, 
    26, 37, 6, 16, 7), f27 = c(11, 64, 101, 5, 17, 12), f28 = c(0, 
    0, 0, 0, 0, 0), f29 = c(2, 37, 101, 7, 26, 8), f30 = c(0, 
    18, 32, 2, 16, 4), f31 = c(0, 0, 0, 0, 0, 0), f32 = c(0, 
    0, 0, 0, 0, 0), f33 = c(3, 0, 1, 0, 1, 0), f34 = c(5, 44, 
    32, 4, 15, 5), f35 = c(0, 0, 0, 0, 0, 0), f36 = c(0, 0, 0, 
    0, 0, 0), f37 = c(0, 0, 0, 0, 0, 0), f38 = c(0, 0, 0, 0, 
    0, 0), f39 = c(6, 8, 10, 3, 2, 3), f40 = c(4, 6, 16, 4, 4, 
    3), f41 = c(18, 36, 37, 7, 5, 7), f42 = c(0, 18, 27, 0, 14, 
    1), f43 = c(0, 0, 0, 0, 0, 0), f44 = c(54, 743, 910, 65, 
    184, 100), f45 = c(14, 133, 91, 25, 18, 40), f46 = c(0, 0, 
    0, 0, 0, 0), f47 = c(4, 25, 17, 6, 6, 8), f48 = c(0, 0, 0, 
    0, 0, 0), f49 = c(0.46, 1, 1.5, 1.14, 1.5, 1.14), f50 = c(2.67, 
    1.86, 1.83, 1.88, 1.67, 1.88), f51 = c(3, 9, 1, 2, 1, 2), 
    f52 = c(0, 1, 2, 1, 1, 1), f53 = c(10, 12, 5, 4, 2, 4), f54 = c(0, 
    0, 0, 0, 0, 0), ft55 = c(3, 10, 3, 3, 2, 3), f56 = c(0.54, 
    0.07, 0.03, 0.32, 0.07, 0.21), f57 = c(0.21, 0.04, 0.01, 
    0.14, 0.02, 0.1), f58 = c(0.21, 0.02, 0.01, 0.07, 0.01, 0.05
    ), f59 = c(0, 0, 0, 0, 0, 0), f60 = c(0.32, 0.04, 0.02, 0.18, 
    0.04, 0.12), f61 = c(0.11, 0.02, 0.01, 0.07, 0.01, 0.05), 
    f62 = c(0.04, 0, 0, 0, 0, 0), f63 = c(0.14, 0.03, 0.01, 0.14, 
    0.02, 0.1), f64 = c(0.18, 0.01, 0, 0, 0, 0), f65 = c(0.04, 
    0.01, 0, 0, 0, 0), f66 = c(0.04, 0.01, 0.01, 0.07, 0.01, 
    0.05), f67 = c(0, 0, 0, 0, 0, 0), f68 = c(0.46, 0.04, 0.02, 
    0.21, 0.04, 0.14), f69 = c(0, 0.02, 0, 0.04, 0.01, 0.02), 
    f70 = c(0, 0, 0, 0, 0, 0), f71 = c(0.71, 0.09, 0.03, 0.36, 
    0.07, 0.24), f72 = c(0.18, 0, 0, 0, 0, 0), f73 = c(0, 0, 
    0, 0, 0, 0), f74 = c(1, 1, 1, 1, 1, 1), f75 = c(0.08, 0.05, 
    0.12, 0.14, 0.25, 0.14), f76 = c(0, 0, 0, 0, 0, 0), f77 = c(0, 
    0, 0, 0, 0, 0), f78 = c(0.21, 0.02, 0.01, 0.07, 0.01, 0.05
    ), f79 = c(0, 0, 0, 0, 0, 0), f80 = c(0.68, 0.97, 0.99, 0.82, 
    0.96, 0.88), f81 = c(0, 0.08, 0.09, 0.21, 0.2, 0.17), f82 = c(0.39, 
    0.19, 0.25, 0.18, 0.21, 0.29), f83 = c(0, 0, 0, 0, 0, 0), 
    f84 = c(0.07, 0.11, 0.25, 0.25, 0.32, 0.19), f85 = c(0, 0.05, 
    0.08, 0.07, 0.2, 0.1), f86 = c(0, 0, 0, 0, 0, 0), f87 = c(0, 
    0, 0, 0, 0, 0), f88 = c(0.11, 0, 0, 0, 0.01, 0), f89 = c(0.18, 
    0.13, 0.08, 0.14, 0.18, 0.12), f90 = c(0, 0, 0, 0, 0, 0), 
    f91 = c(0, 0, 0, 0, 0, 0), f92 = c(0, 0, 0, 0, 0, 0), f93 = c(0, 
    0, 0, 0, 0, 0), f94 = c(0.21, 0.02, 0.03, 0.11, 0.02, 0.07
    ), f95 = c(0.14, 0.02, 0.04, 0.14, 0.05, 0.07), f96 = c(0.64, 
    0.1, 0.09, 0.25, 0.06, 0.17), f97 = c(0, 0.05, 0.07, 0, 0.17, 
    0.02), f98 = c(0, 0, 0, 0, 0, 0), f99 = c(1.93, 2.16, 2.28, 
    2.32, 2.24, 2.38), f100 = c(0.5, 0.39, 0.23, 0.89, 0.22, 
    0.95), f101 = c(0, 0, 0, 0, 0, 0), f102 = c(0.14, 0.07, 0.04, 
    0.21, 0.07, 0.19), f103 = c(0, 0, 0, 0, 0, 0), f104 = c(0.02, 
    0, 0, 0.04, 0.02, 0.03), f105 = c(0.1, 0.01, 0, 0.07, 0.02, 
    0.04), f106 = c(0.11, 0.03, 0, 0.07, 0.01, 0.05), f107 = c(0, 
    0, 0.01, 0.04, 0.01, 0.02), f108 = c(0.36, 0.03, 0.01, 0.14, 
    0.02, 0.1), f109 = c(0, 0, 0, 0, 0, 0), f110 = c(0.11, 0.03, 
    0.01, 0.11, 0.02, 0.07)), .Names = c("label", "f1", "f2", 
"f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12", 
"f13", "f14", "f15", "f16", "f17", "f18", "ft19", "f20", "f21", 
"f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29", "f30", 
"f31", "f32", "f33", "f34", "f35", "f36", "f37", "f38", "f39", 
"f40", "f41", "f42", "f43", "f44", "f45", "f46", "f47", "f48", 
"f49", "f50", "f51", "f52", "f53", "f54", "ft55", "f56", "f57", 
"f58", "f59", "f60", "f61", "f62", "f63", "f64", "f65", "f66", 
"f67", "f68", "f69", "f70", "f71", "f72", "f73", "f74", "f75", 
"f76", "f77", "f78", "f79", "f80", "f81", "f82", "f83", "f84", 
"f85", "f86", "f87", "f88", "f89", "f90", "f91", "f92", "f93", 
"f94", "f95", "f96", "f97", "f98", "f99", "f100", "f101", "f102", 
"f103", "f104", "f105", "f106", "f107", "f108", "f109", "f110"
), class = "data.frame", row.names = c(NA, -6L))

6 个答案:

答案 0 :(得分:3)

# Keep only the columns that hold at least one non-zero value, i.e. drop the
# columns whose entries are all 0. Counting the non-zero cells per column
# (instead of negating the count of zero cells) avoids also dropping columns
# that merely contain some zeros. drop = FALSE keeps a data frame even when
# only one column survives.
training_data[, colSums(training_data != 0) > 0, drop = FALSE]

基于问题更新:(过滤器应用于第1 - 99列)

# Among columns 1:99, find the positions of the columns whose entries are all
# 0: a column is all-zero exactly when it has no non-zero cell.
idx <- which(colSums(training_data[, 1:99] != 0) == 0)
# Exclude those columns by position. setdiff() also behaves when idx is empty
# (a negative index built from an empty idx would select no columns at all).
training_data[, setdiff(seq_along(training_data), idx), drop = FALSE]

答案 1 :(得分:2)

您可以使用colSums

# Start from a 10 x 10 identity matrix and clear diagonal entries (1, 1) and
# (5, 5) in one step via matrix indexing, leaving columns 1 and 5 all zero.
dat <- diag(10)
dat[cbind(c(1, 5), c(1, 5))] <- 0

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
 [1,]    0    0    0    0    0    0    0    0    0     0
 [2,]    0    1    0    0    0    0    0    0    0     0
 [3,]    0    0    1    0    0    0    0    0    0     0
 [4,]    0    0    0    1    0    0    0    0    0     0
 [5,]    0    0    0    0    0    0    0    0    0     0
 [6,]    0    0    0    0    0    1    0    0    0     0
 [7,]    0    0    0    0    0    0    1    0    0     0
 [8,]    0    0    0    0    0    0    0    1    0     0
 [9,]    0    0    0    0    0    0    0    0    1     0
[10,]    0    0    0    0    0    0    0    0    0     1

colSums(dat) == 0
 TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE

因此,要删除0列,只需执行此操作

dat[  ,colSums(dat)!=0]
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
 [1,]    0    0    0    0    0    0    0    0
 [2,]    1    0    0    0    0    0    0    0
 [3,]    0    1    0    0    0    0    0    0
 [4,]    0    0    1    0    0    0    0    0
 [5,]    0    0    0    0    0    0    0    0
 [6,]    0    0    0    1    0    0    0    0
 [7,]    0    0    0    0    1    0    0    0
 [8,]    0    0    0    0    0    1    0    0
 [9,]    0    0    0    0    0    0    1    0
[10,]    0    0    0    0    0    0    0    1

编辑

上面的做法假设所有数据符号相同(否则一列中的正负值相加可能恰好为0,从而被误判为全零列)。为避免依赖这一假设,可以先取绝对值再求和:

# `dat` stands for a data set with at least 99 columns whose columns 1:99 are
# numeric. Summing absolute values means positive and negative entries cannot
# cancel out to 0. The all-zero columns are identified by position within
# 1:99 and removed by position, so the mask always lines up with the columns
# it was computed from and any columns beyond 99 are kept unchanged.
dat[, setdiff(seq_len(ncol(dat)), (1:99)[colSums(abs(dat[, 1:99])) == 0]),
    drop = FALSE]

答案 2 :(得分:2)

# Test each column in its own type and keep those that are not entirely 0.
# vapply() walks the data frame column by column, whereas apply() would first
# coerce a mixed-type data frame to a character matrix and turn the zero test
# into a string comparison. drop = FALSE keeps a data frame even when only
# one column survives.
training_data[, vapply(training_data, function(x) !all(x == 0), logical(1)),
              drop = FALSE]

答案 3 :(得分:2)

另一种方式是使用lapply,因为这是一个data.frame(我认为apply会在内部将data.frame转换为matrix):

# Treat the data frame as a list of columns and keep every column that has at
# least one entry different from 0.
df[unlist(lapply(df, function(col) any(col != 0)))]

或者在你的情况下:

# Same column-wise test, restricted to columns 1:99: keep the columns that
# have at least one entry different from 0.
df[, 1:99][unlist(lapply(df[, 1:99], function(col) any(col != 0)))]

修改:使用colSums的另一种方式。诀窍是在检查0之后使用它。

# Count the zero cells per column and keep the columns where that count is
# not the full number of rows, i.e. the columns that are not all zero.
df[colSums(df == 0) != nrow(df)]

如果您知道哪些列是数字的(例如,1:99),请将df替换为:

# Within columns 1:99, keep the columns whose number of zero cells differs
# from the number of rows, i.e. the columns that are not all zero.
df[, 1:99][colSums(df[, 1:99] == 0) != nrow(df)]

答案 4 :(得分:2)

我认为,在那些使用all(x == 0)的解决方案中改用any(x != 0)会稍微高效一些,因为any在遇到第一个不等于0的元素后就会停止;随着行数的增加,这一点会变得重要。

下面使用plyr的colwise(dat为上面的dput数据)提供另一种解决方案:

library(plyr)
# Column predicate: TRUE when the column is numeric and has at least one
# entry different from 0.
# x: one column of the data (a vector).
# The type check comes first and `&&` short-circuits, so the comparison with
# 0 is only evaluated for numeric columns; `&&` is also the scalar operator
# meant for a single TRUE/FALSE condition.
f0 <- function(x) is.numeric(x) && any(x != 0)
colwise(identity, f0)(dat)

思路是遍历dat中的每一列并原样返回该列(identity),但仅当f0返回TRUE时才返回,即该列至少有一个条目!=0,并且该列is.numeric。

修改:对列表中的每个data.frame执行此操作,例如:

training_data <- list(dat, dat, dat, dat)

EDIT2:保留标签列:

# colwise(identity, f0) is itself a function of one data frame, so it can be
# handed to lapply() directly to clean every data frame in the list.
training_data_clean <- lapply(training_data, colwise(identity, f0))

sapply(training_data, dim)
     [,1] [,2] [,3] [,4]
[1,]    6    6    6    6
[2,]  111  111  111  111

sapply(training_data_clean, dim)
     [,1] [,2] [,3] [,4]
[1,]    6    6    6    6
[2,]   74   74   74   74

答案 5 :(得分:1)

# Filter() walks the data frame column by column and keeps the columns for
# which the predicate is TRUE, i.e. the columns that are not entirely 0.
# (Passing Filter to apply() would instead filter the individual elements
# inside each column.)
Filter(function(x) !all(x == 0), df)

我也遇到了同样的问题。