找到每列中不同范围内的值的数量

时间:2016-07-14 14:28:38

标签: r

我的数据看起来像这样

# Sample data: ten rows, five numeric columns; NaN marks a missing entry.
df <- data.frame(
  A  = c(0.91971, 0.61566, 0.78723, 1.038, 0.65656,
         0.9448, NaN, 1.1353, 0.82117, 0.15673),
  RA = c(NaN, 10, NaN, 200, NaN, 0.2, NaN, NaN, 30, NaN),
  B  = c(100, 0.2, NaN, 400, NaN, NaN, 20, NaN, 3, NaN),
  CM = c(NaN, NaN, 77, NaN, 2, NaN, 0.02, NaN, 0.8, 1),
  D  = c(6, 5, NaN, NaN, NaN, 0.1, 0.5, NaN, NaN, NaN)
)


#         A    RA     B    CM   D
#1  0.91971   NaN 100.0   NaN 6.0
#2  0.61566  10.0   0.2   NaN 5.0
#3  0.78723   NaN   NaN 77.00 NaN
#4  1.03800 200.0 400.0   NaN NaN
#5  0.65656   NaN   NaN  2.00 NaN
#6  0.94480   0.2   NaN   NaN 0.1
#7      NaN   NaN  20.0  0.02 0.5
#8  1.13530   NaN   NaN   NaN NaN
#9  0.82117  30.0   3.0  0.80 NaN
#10 0.15673   NaN   NaN  1.00 NaN

我想知道有多少元素存在:

  • 介于[0, 1)之间
  • 介于[1, 5)之间
  • 介于[5, 10)之间
  • >= 10

所以输出应该看起来像第一列

         ColumnA   columnRA    columnB    columnCM   columnD  
0 to 1    7           1          1         2            2
1 to 5    2           0          1         2            0
5 to 10   0           0          0         0            2
above 10  0           3          3         1            0

我尝试使用sapply,但我无法弄清楚如何做到这一点

count0-1 <-sapply(x, function(x) sum(length(which(x >0 & <1))))

3 个答案:

答案 0 :(得分:4)

您可以一次性完成此操作:

# Count, for every column of df, how many values fall into each half-open bin
# [0, 1), [1, 5), [5, 10) and [10, Inf). NaN entries fall into no bin.
# vapply() pins the result to an integer matrix with one row per bin, where
# sapply() would silently change its return type on unusual input.
df2 <- vapply(
  df,
  function(x) {
    # right = FALSE makes the intervals closed on the left, open on the right.
    # Spelled out as FALSE because F is an ordinary, reassignable variable.
    bins <- cut(x, breaks = c(0, 1, 5, 10, Inf), right = FALSE)
    as.vector(table(bins))
  },
  integer(4L)
)

rownames(df2) <- c("0 to 1", "1 to 5", "between 5 and 10", "above 10")
colnames(df2) <- paste0("Column", colnames(df2))


                 ColumnA ColumnRA ColumnB ColumnCM ColumnD
0 to 1                 7        1       1        2       2
1 to 5                 2        0       1        2       0
between 5 and 10       0        0       0        0       2
above 10               0        3       3        1       0

更新:

正如@ m0h3n在评论中所建议的那样,使用apply会更好:

# Per-column counts of the values falling into the half-open bins
# [0, 1), [1, 5), [5, 10) and [10, Inf); `labels` names the bins directly,
# so no separate rownames() assignment is needed afterwards.
apply(df, 2, function(x) {
  table(cut(
    x,
    breaks = c(0, 1, 5, 10, Inf),
    # FALSE rather than F: F is an ordinary variable that can be reassigned.
    right = FALSE,
    labels = c("0 to 1", "1 to 5", "between 5 and 10", "and above 10")
  ))
})

(已根据 @user2100721 的评论删除 rownames)

答案 1 :(得分:2)

另一种方法:

# Bin edges: bin i is the half-open interval [rng[i], rng[i + 1]).
rng <- c(0, 1, 5, 10, Inf)

# For each bin, build a logical mask over the whole data frame and count the
# TRUE entries per column. Comparisons with NaN yield NA, which na.rm = TRUE
# drops from the sums. seq_len() is used instead of seq() on a vector, which
# would misbehave if only one bin were left; vapply() fixes the result type.
t(vapply(
  seq_len(length(rng) - 1L),
  function(i) colSums(df >= rng[i] & df < rng[i + 1], na.rm = TRUE),
  numeric(ncol(df))
))

     # A RA B CM D
# [1,] 7  1 1  2 2
# [2,] 2  0 1  2 0
# [3,] 0  0 0  0 2
# [4,] 0  3 3  1 0

这是一个简短的基准测试。可以看出,这个解决方案是最快的。

library(microbenchmark)

# Sample data for the benchmark: ten rows, five numeric columns; NaN marks a
# missing entry.
df <- data.frame(
  A  = c(0.91971, 0.61566, 0.78723, 1.038, 0.65656,
         0.9448, NaN, 1.1353, 0.82117, 0.15673),
  RA = c(NaN, 10, NaN, 200, NaN, 0.2, NaN, NaN, 30, NaN),
  B  = c(100, 0.2, NaN, 400, NaN, NaN, 20, NaN, 3, NaN),
  CM = c(NaN, NaN, 77, NaN, 2, NaN, 0.02, NaN, 0.8, 1),
  D  = c(6, 5, NaN, NaN, NaN, 0.1, 0.5, NaN, NaN, NaN)
)

# Count, for every column of `df`, how many values fall into the half-open
# bins [0, 1), [1, 5), [5, 10) and [10, Inf) using cut() + table().
#
# df: a data frame (or list) of numeric columns; NaN/NA values are not counted.
# Returns an integer matrix with one row per bin and one column per column of
# `df` (columns named after `df`, rows unnamed).
f_Sumedh <- function(df) {
  breaks <- c(0, 1, 5, 10, Inf)
  vapply(
    df,
    # right = FALSE: intervals closed on the left. Written as FALSE because F
    # is a plain variable that callers may have reassigned.
    function(x) as.vector(table(cut(x, breaks = breaks, right = FALSE))),
    integer(length(breaks) - 1L)
  )
}
# Count, for every column of `df`, how many values fall into the half-open
# bins [0, 1), [1, 5), [5, 10) and [10, Inf) using apply() over columns.
#
# df: a data frame or matrix of numeric columns; NaN/NA values are not counted.
# Returns a matrix of counts with one row per bin and one column per column
# of `df`.
f_m0h3n1 <- function(df) {
  bin_labels <- c("0 to 1", "1 to 5", "between 5 and 10", "and above 10")
  count_bins <- function(x) {
    table(cut(
      x,
      breaks = c(0, 1, 5, 10, Inf),
      # FALSE rather than F: F is an ordinary, reassignable variable.
      right = FALSE,
      labels = bin_labels
    ))
  }
  as.matrix(apply(df, 2, count_bins))
}
# Count, for every column of `df`, how many values fall into the half-open
# bins [0, 1), [1, 5), [5, 10) and [10, Inf) using one vectorised comparison
# per bin followed by colSums().
#
# df: a data frame of numeric columns; NaN/NA values are not counted.
# Returns a numeric matrix with one row per bin and one column per column of
# `df` (columns named after `df`, rows unnamed).
f_m0h3n2 <- function(df) {
  rng <- c(0, 1, 5, 10, Inf)
  # seq_len() instead of seq(head(rng, -1)): seq() on a length-one vector
  # would expand to 1:value rather than a single index.
  t(vapply(
    seq_len(length(rng) - 1L),
    # Comparisons with NaN give NA; na.rm = TRUE drops them from the count.
    function(i) colSums(df >= rng[i] & df < rng[i + 1], na.rm = TRUE),
    numeric(ncol(df))
  ))
}

# Sanity check before timing: compare the counts of the other two
# implementations element-wise against f_Sumedh()'s result.
r <- f_Sumedh(df)
all(r==f_m0h3n1(df))
# [1] TRUE
all(r==f_m0h3n2(df))
# [1] TRUE

# Time the three implementations on the sample data frame.
microbenchmark(f_Sumedh(df), f_m0h3n1(df), f_m0h3n2(df))


# Unit: microseconds
         # expr     min       lq     mean   median      uq      max neval
 # f_Sumedh(df) 715.719 768.7520 826.6880 799.0985 837.859 1709.371   100
 # f_m0h3n1(df) 482.855 512.9015 565.0632 531.8310 578.554 1460.582   100
 # f_m0h3n2(df) 371.680 412.2440 460.9897 432.9770 473.240 1190.761   100

答案 2 :(得分:1)

也许这在某种程度上有帮助

# Bin each column with lapply() so cut()'s factors keep all four levels.
# Going through apply() would collapse the factors into a character matrix,
# after which table() drops every zero-count bin and the per-column results
# no longer line up into a 4 x 5 matrix.
vapply(
  lapply(df, cut, breaks = c(0, 1, 5, 10, Inf), right = FALSE),
  table,
  integer(4L)
)