我在创建一个根据P列分组并计算所有" YES"对于已保存在列表中的列,例如" list_col"。我很确定我会使用apply系列中的函数,但不确定如何按特定列进行分组(在这种情况下为Col P)
P <- as.character(c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"))
a <- as.character(c(NA,"Yes","Yes",NA,NA,NA,"Yes","Yes","Yes",NA))
b <- as.character(c(NA,"Yes",NA,NA,"Yes",NA,"Yes","Yes",NA,NA))
c <- as.character(c(NA,NA,NA,NA,"Yes",NA,"Yes",NA,NA,NA))
df_sample <- as.data.frame(cbind(P, a, b, c))
df_sample
list_col <- c("a","b","c")
理想情况下,我会根据上面的示例数据框和下面的答案寻找以下答案,并根据&#34; list_col&#34;
更改列P a b c
A 2 2 1
B 3 2 1
任何帮助将不胜感激
答案 0 :(得分:3)
以下是dplyr
,
library(dplyr)
df_sample %>%
group_by(P) %>%
select(list_col) %>%
summarise_all(funs(sum(. == 'Yes', na.rm = TRUE)))
#Adding missing grouping variables: `P`
# A tibble: 2 x 4
# P a b c
# <fctr> <int> <int> <int>
#1 A 2 2 1
#2 B 3 2 1
答案 1 :(得分:2)
在data.table
:
library(data.table)
list_col <- c("a","b","c")
setDT(df_sample)[, (lapply(.SD, function(x) sum(x=="Yes", na.rm = TRUE))), by = P, .SDcols = list_col]
# P a b c
#1: A 2 2 1
#2: B 3 2 1
或者,基本R解决方案仍使用lapply
:
res <-lapply(split(df_sample[,list_col], df_sample$P), function(x) colSums(x=="Yes", na.rm = TRUE))
do.call(rbind, res)
# a b c
#A 2 2 1
#B 3 2 1
为了它的价值,我的机器上有一个微基准测试:
microbenchmark::microbenchmark(splitlapply = do.call(rbind, lapply(split(df_sample[,list_col], df_sample$P), function(x) colSums(x=="Yes", na.rm = TRUE))),
+ dt = sampleDT[, (lapply(.SD, function(x) sum(x=="Yes", na.rm = TRUE))), by = P, .SDcols = list_col])
Unit: microseconds
expr min lq mean median uq max neval
splitlapply 455.841 505.0715 546.6699 529.3225 561.2315 889.436 100
dt 861.722 1052.9920 1114.2752 1111.7040 1166.7695 1707.761 100
答案 2 :(得分:1)
使用melt
reshape
library(reshape)
df=na.omit(melt(df_sample,id.vars='P'))
table(df$P,df$variable)
a b c
A 2 2 1
B 3 2 1
答案 3 :(得分:0)
df_sample <- as.data.frame(cbind(P, a, b, c), stringsAsFactors = FALSE)
for (i in list_col){
df_r <- df_sample[, i] == 'Yes' & !is.na(df_sample[, i])
df_sample[df_r, i] <- df_sample$P[df_r]
}
sapply(df_sample[, list_col], table)