我的数据看起来像这样
# Example data: a data frame with five numeric columns (A, RA, B, CM, D) of
# ten values each; several entries are NaN.
df <- data.frame(
  A = c(
    0.91971, 0.61566, 0.78723, 1.038, 0.65656,
    0.9448, NaN, 1.1353, 0.82117, 0.15673
  ),
  RA = c(NaN, 10, NaN, 200, NaN, 0.2, NaN, NaN, 30, NaN),
  B = c(100, 0.2, NaN, 400, NaN, NaN, 20, NaN, 3, NaN),
  CM = c(NaN, NaN, 77, NaN, 2, NaN, 0.02, NaN, 0.8, 1),
  D = c(6, 5, NaN, NaN, NaN, 0.1, 0.5, NaN, NaN, NaN)
)
# A RA B CM D
#1 0.91971 NaN 100.0 NaN 6.0
#2 0.61566 10.0 0.2 NaN 5.0
#3 0.78723 NaN NaN 77.00 NaN
#4 1.03800 200.0 400.0 NaN NaN
#5 0.65656 NaN NaN 2.00 NaN
#6 0.94480 0.2 NaN NaN 0.1
#7 NaN NaN 20.0 0.02 0.5
#8 1.13530 NaN NaN NaN NaN
#9 0.82117 30.0 3.0 0.80 NaN
#10 0.15673 NaN NaN 1.00 NaN
我想知道每一列中有多少个元素落在以下区间内:
[0, 1)
[1, 5)
[5, 10)
>= 10
所以输出应该如下所示(第一列是区间标签):
ColumnA columnRA columnB columnCM columnD
0 to 1 7 1 1 2 2
1 to 5 2 0 1 2 0
5 to 10 0 0 0 0 2
above 10 0 3 3 1 0
我尝试使用 sapply,但我无法弄清楚如何做到这一点:
# NOTE(review): this is the asker's attempt, kept as written; it does not run:
#   - `x >0 & <1` is a syntax error: `<` has no left operand
#     (it would have to read `x > 0 & x < 1`);
#   - `count0-1` is read as the expression `count0 - 1`, which is not a valid
#     assignment target;
#   - the object `x` passed to sapply() is not defined in the visible code
#     (the data frame above is called `df`).
# Also note that `x > 0` would exclude 0, whereas the requested interval is [0, 1).
count0-1 <-sapply(x, function(x) sum(length(which(x >0 & <1))))
答案 0 :(得分:4)
您可以一次性完成此操作:
# Count, for every column of df, how many values fall into each of the
# intervals [0, 1), [1, 5), [5, 10) and [10, Inf).
#
# cut() assigns each value to an interval; right = FALSE makes the intervals
# closed on the left and open on the right. NaN values become NA in the
# factor and are therefore not counted by table().
# vapply() with integer(4) guarantees a 4-row integer matrix (one column per
# column of df) instead of relying on sapply()'s simplification.
df2 <- vapply(
  df,
  function(x) {
    as.vector(table(cut(x, breaks = c(0, 1, 5, 10, Inf), right = FALSE)))
  },
  integer(4)
)
# Human-readable labels for the four intervals and prefixed column names.
rownames(df2) <- c("0 to 1", "1 to 5", "between 5 and 10", "above 10")
colnames(df2) <- paste0("Column", colnames(df2))
ColumnA ColumnRA ColumnB ColumnCM ColumnD
0 to 1 7 1 1 2 2
1 to 5 2 0 1 2 0
between 5 and 10 0 0 0 0 2
above 10 0 3 3 1 0
**更新**
正如 @m0h3n 在评论中所建议的那样,使用 apply 会更好:
# Per-column counts of values in [0, 1), [1, 5), [5, 10) and [10, Inf).
# The interval labels are passed straight to cut(), so the resulting matrix
# already carries readable row names.
# right = FALSE (spelled out, since F is an ordinary variable that can be
# reassigned) makes the intervals closed on the left.
apply(df, 2, function(x) {
  table(cut(
    x,
    breaks = c(0, 1, 5, 10, Inf),
    right = FALSE,
    labels = c("0 to 1", "1 to 5", "between 5 and 10", "and above 10")
  ))
})
(已根据 @user2100721 的评论删除了 rownames 的设置。)
答案 1 :(得分:2)
另一种方法:
# Interval boundaries: consecutive pairs define [rng[i], rng[i + 1]).
rng <- c(0, 1, 5, 10, Inf)
# For each interval, compare the whole data frame against both bounds at once
# and count the TRUE entries per column; NaN comparisons yield NA and are
# dropped by na.rm = TRUE. sapply() returns one column per interval, so the
# result is transposed to get one row per interval.
# seq_len(length(rng) - 1) is used instead of seq(head(rng, -1)): with only
# two breaks, seq() would receive a single number n and return 1:n.
t(sapply(seq_len(length(rng) - 1), function(i) {
  colSums(df >= rng[i] & df < rng[i + 1], na.rm = TRUE)
}))
# A RA B CM D
# [1,] 7 1 1 2 2
# [2,] 2 0 1 2 0
# [3,] 0 0 0 0 2
# [4,] 0 3 3 1 0
这是一个简短的基准测试。可以看出,这个解决方案是最快的。
library(microbenchmark)
# Benchmark input: a data frame with five numeric columns (A, RA, B, CM, D)
# of ten values each; several entries are NaN.
df <- data.frame(
  A = c(
    0.91971, 0.61566, 0.78723, 1.038, 0.65656,
    0.9448, NaN, 1.1353, 0.82117, 0.15673
  ),
  RA = c(NaN, 10, NaN, 200, NaN, 0.2, NaN, NaN, 30, NaN),
  B = c(100, 0.2, NaN, 400, NaN, NaN, 20, NaN, 3, NaN),
  CM = c(NaN, NaN, 77, NaN, 2, NaN, 0.02, NaN, 0.8, 1),
  D = c(6, 5, NaN, NaN, NaN, 0.1, 0.5, NaN, NaN, NaN)
)
# Benchmark candidate 1: per-column interval counts via sapply/table/cut.
#
# df: a data frame of numeric columns.
# Returns a matrix with one row per interval ([0, 1), [1, 5), [5, 10),
# [10, Inf)) and one column per column of df; NaN/NA values are not counted.
f_Sumedh <- function(df) {
  counts <- sapply(df, function(x) {
    # right = FALSE: intervals are closed on the left. Written as FALSE
    # because F is an ordinary variable that can be reassigned.
    t(table(cut(x, breaks = c(0, 1, 5, 10, Inf), right = FALSE)))
  })
  as.matrix(counts)
}
# Benchmark candidate 2: per-column interval counts via apply/table/cut,
# with the interval labels supplied to cut().
#
# df: a data frame of numeric columns.
# Returns a matrix with one row per interval ([0, 1), [1, 5), [5, 10),
# [10, Inf)) and one column per column of df; NaN/NA values are not counted.
f_m0h3n1 <- function(df) {
  counts <- apply(df, 2, function(x) {
    table(cut(
      x,
      breaks = c(0, 1, 5, 10, Inf),
      # Left-closed intervals; FALSE rather than the reassignable variable F.
      right = FALSE,
      labels = c("0 to 1", "1 to 5", "between 5 and 10", "and above 10")
    ))
  })
  as.matrix(counts)
}
# Benchmark candidate 3: per-column interval counts via vectorised
# comparisons on the whole data frame and colSums().
#
# df: a data frame of numeric columns.
# Returns a matrix with one row per interval ([0, 1), [1, 5), [5, 10),
# [10, Inf)) and one column per column of df; NaN/NA values are not counted.
f_m0h3n2 <- function(df) {
  rng <- c(0, 1, 5, 10, Inf)
  # One iteration per interval [rng[i], rng[i + 1]). seq_len() is used rather
  # than seq(head(rng, -1)), which would return 1:n for a single remaining
  # break n.
  counts <- sapply(seq_len(length(rng) - 1), function(i) {
    # Comparisons with NaN give NA; na.rm = TRUE leaves them out of the sum.
    colSums(df >= rng[i] & df < rng[i + 1], na.rm = TRUE)
  })
  # sapply() yields one column per interval; transpose to one row per interval.
  t(counts)
}
# Sanity check before timing: use the first implementation as the reference
# and confirm that the other two produce the same counts element by element.
r <- f_Sumedh(df)
all(r==f_m0h3n1(df))
# [1] TRUE
all(r==f_m0h3n2(df))
# [1] TRUE
# Time the three implementations on the same input; the timings are printed
# below.
microbenchmark(f_Sumedh(df), f_m0h3n1(df), f_m0h3n2(df))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_Sumedh(df) 715.719 768.7520 826.6880 799.0985 837.859 1709.371 100
# f_m0h3n1(df) 482.855 512.9015 565.0632 531.8310 578.554 1460.582 100
# f_m0h3n2(df) 371.680 412.2440 460.9897 432.9770 473.240 1190.761 100
答案 2 :(得分:1)
也许这在某种程度上有帮助
# Per-column counts of values in [0, 1), [1, 5), [5, 10) and [10, Inf).
# cut() and table() must run inside the same function: a separate
# apply(df, 2, cut, ...) step returns a character matrix, so the factor
# levels are lost and table() would omit intervals with a zero count,
# giving tables of different lengths (a list instead of a matrix).
# vapply() with integer(4) guarantees one 4-row column of counts per
# column of df.
vapply(
  df,
  function(x) table(cut(x, breaks = c(0, 1, 5, 10, Inf), right = FALSE)),
  integer(4)
)