我的数据包含numeric
和非numeric
列,如下所示:
mydt
vnum1 vint1 vfac1 vch1
1: -0.30159484 8 3 E
2: -0.09833430 8 1 D
3: -2.15963282 1 3 D
4: 0.03904374 5 2 B
5: 1.54928970 4 1 C
6: -0.73873654 5 1 A
7: -0.68594479 9 2 B
8: 1.35765612 1 2 E
9: 1.46958351 2 1 B
10: -0.89623979 2 4 E
如何仅选择numeric
列并使用mean
计算data.table
?
我试过这个:
mydt[ , lapply(.SD, mean), ]
# vnum1 vint1 vfac1 vch1
# 1: -0.046491 4.5 NA NA
# Warning messages:
# 1: In mean.default(X[[3L]], ...) :
# argument is not numeric or logical: returning NA
# 2: In mean.default(X[[4L]], ...) :
# argument is not numeric or logical: returning NA
dput(mydt)
structure(list(vnum1 = c(-0.301594844692861, -0.0983343040483769,
-2.15963282153076, 0.03904374068617, 1.54928969700272, -0.738736535236348,
-0.685944791146016, 1.35765612481877, 1.46958350568506, -0.896239790653183
), vint1 = c(8L, 8L, 1L, 5L, 4L, 5L, 9L, 1L, 2L, 2L), vfac1 = structure(c(3L,
1L, 3L, 2L, 1L, 1L, 2L, 2L, 1L, 4L), .Label = c("1", "2", "3",
"4"), class = "factor"), vch1 = structure(c(5L, 4L, 4L, 2L, 3L,
1L, 2L, 5L, 2L, 5L), .Label = c("A", "B", "C", "D", "E"), class = "factor")), .Names = c("vnum1",
"vint1", "vfac1", "vch1"), class = c("data.table", "data.frame"
), row.names = c(NA, -10L), .internal.selfref = <pointer: 0x991c070>)
我也尝试了以下方法,但它不起作用:
mydt[ , lapply(.SD, is.numeric),]
# vnum1 vint1 vfac1 vch1
# 1: TRUE TRUE FALSE FALSE
mydt[,mydt[,lapply(.SD, is.numeric),]]
# vnum1 vint1 vfac1 vch1
# 1: TRUE TRUE FALSE FALSE
mydt[ , mydt[ , lapply(.SD, is.numeric) , ], with = F]
# Error in Math.data.frame(j) :
# non-numeric variable in data frame: vnum1vint1vfac1vch1
mydt[ , c(mydt[ , lapply(.SD, is.numeric)), ], with = F]
# Error: unexpected ')' in "mydt[,c(mydt[,lapply(.SD, is.numeric))"
正如@Arun所建议的那样,我尝试过以下但无法获得一个子集:
xx <- mydt[ , lapply(.SD, is.numeric), ]
xx
# vnum1 vint1 vfac1 vch1
# 1: TRUE TRUE FALSE FALSE
mydt[ , lapply(.SD, mean), .SDcols = xx]
# Error in `[.data.table`(mydt, , lapply(.SD, mean), .SDcols = xx) :
# .SDcols should be column numbers or names
根据@David的建议,我尝试了以下内容,但非数字列的值为NULL
。我想获得mydt的子集,以便甚至不列出其他列。
mydt[ , lapply(.SD, function(x) if(is.numeric(x)) mean(x))]
# vnum1 vint1 vfac1 vch1
# 1: -0.046491 4.5 NULL NULL
我缺少data.frame:
sapply(mydf, is.numeric)
# vnum1 vint1 vfac1 vch1
# TRUE TRUE FALSE FALSE
mydf[sapply(mydf, is.numeric)]
# vnum1 vint1
#1 -0.30159484 8
#2 -0.09833430 8
#3 -2.15963282 1
#4 0.03904374 5
#5 1.54928970 4
#6 -0.73873654 5
#7 -0.68594479 9
#8 1.35765612 1
#9 1.46958351 2
#10 -0.89623979 2
sapply(mydf[sapply(mydf, is.numeric)], mean)
# vnum1 vint1
#-0.046491 4.500000
行。感谢David的评论,以下作品:
mydt[, sapply(mydt, is.numeric), with = FALSE][,sapply(.SD, mean),]
# vnum1 vint1
# -0.046491 4.500000
mydt[, sapply(mydt, is.numeric), with = FALSE]
# vnum1 vint1
# 1: -0.30159484 8
# 2: -0.09833430 8
# 3: -2.15963282 1
# 4: 0.03904374 5
# ...
答案 0 :(得分:10)
在SO上搜索.SDcols
,我登上this answer,我认为很好地解释了如何使用它。
cols = sapply(mydt, is.numeric)
cols = names(cols)[cols]
mydt[, lapply(.SD, mean), .SDcols = cols]
# vnum1 vint1
# 1: -0.046491 4.5
执行mydt[, sapply(mydt, is.numeric), with = FALSE]
并不是那么有效,因为它子集你的data.table与那些列并且会产生(深层)副本 - 不必要地使用更多内存。
使用colMeans
将data.table强制转换为matrix
,这对内存效率的影响不大。
答案 1 :(得分:1)
您可以在一行中使用以下格式而无需使用 sapply
:
mydt[, lapply(.SD, mean), .SDcols = is.numeric]
此外,如果您正在处理真实数据,那么您的数据很有可能 na 值。以下是 NA 情况下的工作方式:
mydt[, lapply(.SD, function(i) mean(i, na.rm = T)), .SDcols = is.numeric]
答案 2 :(得分:1)
我遇到了同样的问题,下面的代码也可能有帮助。
data("mtcars")
mtcars$X1 <- factor(mtcars$gear, levels = c(4,3,5)); str(mtcars) #create an non numeric column X1
my.mean <- function(x){ if(is.numeric(x)) c(mean(x), median(x))}
my.df <- setNames(as.data.frame(unlist(lapply(mtcars, FUN = my.mean))), "values"); my.df
my.df$names <- rep(c("mean","median"), times = length(row.names(my.df))/2); my.df
my.df$variables <- rownames(my.df); my.df
library(stringr)
my.df$variables <- str_remove(my.df$variables, "[12]"); my.df
data_wide <- spread(my.df, names, values)
data_wide
> data_wide
variables mean median
1 am 0.406250 0.000
2 carb 2.812500 2.000
3 cyl 6.187500 6.000
4 disp 230.721875 196.300
5 drat 3.596563 3.695
6 gear 3.687500 4.000
7 hp 146.687500 123.000
8 mpg 20.090625 19.200
9 qsec 17.848750 17.710
10 vs 0.437500 0.000
11 wt 3.217250 3.325