我需要使用以下信息计算列式统计数据:
library(dplyr)

# Toy input: `id` may repeat across rows; `status` and every `attri*` column
# are logical flags. Spelled with TRUE/FALSE because T and F are ordinary
# variables that can be reassigned.
Input <- data_frame(
  id     = c(1, 2, 2, 3, 3, 3),
  status = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE),
  attri1 = c(TRUE, TRUE, FALSE, FALSE, FALSE, FALSE),
  attri2 = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE)
)
Input
Source: local data frame [6 x 4]
id status attri1 attri2
(dbl) (lgl) (lgl) (lgl)
1 1 TRUE TRUE TRUE
2 2 TRUE TRUE TRUE
3 2 TRUE FALSE TRUE
4 3 FALSE FALSE TRUE
5 3 FALSE FALSE TRUE
6 3 FALSE FALSE FALSE
使用以下步骤生成输出。基本上,sTaT 表示 status == T 且相应的属性为 T;sFaT 表示 status == F 且相应的属性为 T;sFaTuId 基于 sFaT,统计这些行中唯一 id 的个数。
# One summary row per attribute column (every column after id and status):
#   sTaT    - rows where the attribute is TRUE and status is TRUE
#   sFaT    - rows where the attribute is TRUE and status is FALSE
#   sFaTuId - number of distinct ids among the sFaT rows
# The placeholder columns must carry exactly the names the loop assigns to.
Output <- data_frame(
  Attri = names(Input)[c(-1, -2)],
  sTaT = 0,
  sFaT = 0,
  sFaTuId = 0
)
for (attri in Output$Attri) {
  # Keep only rows where this attribute is TRUE; done once per attribute and
  # reused by all three counts below.
  # NOTE(review): filter_() is deprecated in newer dplyr releases, where
  # filter(.data[[attri]]) is the replacement - confirm against the installed
  # version before switching.
  flagged <- Input %>% filter_(attri)
  status_false <- flagged %>% filter(!status)

  target <- Output$Attri == attri
  Output$sTaT[target] <- flagged %>% filter(status) %>% nrow()
  Output$sFaT[target] <- nrow(status_false)
  # Every pipe sits at the END of a line: a line that starts with %>% is parsed
  # as a new (invalid) statement.
  Output$sFaTuId[target] <- status_false %>%
    select(id) %>%
    unique() %>%
    nrow()
}
Output
Source: local data frame [2 x 4]
Attri sTaT sFaT sFaTuId
(chr) (dbl) (dbl) (dbl)
1 attri1 2 0 0
2 attri2 3 2 1
但是,当有许多行和属性列时,进程非常慢。有没有一种有效的方法来计算它?
答案 0 :(得分:1)
我们可以先用 gather 将数据集转换为“长”格式,然后按“Attri”分组并执行 summarise:
library(tidyr)
library(dplyr)

# Reshape to long form: one row per (id, status, attribute) with the flag in
# `Val`. Gathering everything except id/status means any number of attribute
# columns is handled without naming them.
# NOTE(review): gather() is superseded by pivot_longer() in newer tidyr
# releases - confirm the installed version before switching.
gather(Input, Attri, Val, -id, -status) %>%
  group_by(Attri) %>%
  summarise(
    sTaT = sum(status & Val),                 # attribute TRUE, status TRUE
    sFaT = sum(!status & Val),                # attribute TRUE, status FALSE
    sFaTuId = n_distinct(id[!status & Val])   # distinct ids among sFaT rows
  )
# A tibble: 2 × 4
#    Attri  sTaT  sFaT sFaTuId
#    <chr> <int> <int>   <int>
# 1 attri1     2     0       0
# 2 attri2     3     2       1
另一个选项是使用 data.table 中的 melt:
library(data.table)

# setDT() converts `Input` to a data.table by reference (no copy), so `Input`
# itself is a data.table after this call.
# melt() stacks every column matching ^attri<digits> into Attri/value pairs.
long <- melt(
  setDT(Input),
  measure.vars = patterns("^attri\\d+"),
  variable.name = "Attri"
)

# Aggregate once per attribute.
long[
  ,
  .(
    sTaT = sum(status & value),               # attribute TRUE, status TRUE
    sFaT = sum(!status & value),              # attribute TRUE, status FALSE
    sFaTuId = uniqueN(id[!status & value])    # distinct ids among sFaT rows
  ),
  by = .(Attri)
]
#     Attri sTaT sFaT sFaTuId
# 1: attri1    2    0       0
# 2: attri2    3    2       1
答案 1 :(得分:0)
我发现 doParallel 包是潜在的解决方案之一。
library(doParallel)

# Use all cores but one, and never fewer than one worker: detectCores() - 1
# is 0 on a single-core machine and NA when the core count is unknown, and
# either value would make makeCluster() fail.
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)

# NOTE(review): type = "FORK" is, per the parallel package documentation,
# only available on Unix-alikes - confirm the target platform.
cl <- makeCluster(no_cores, type = "FORK")
registerDoParallel(cl)
#' Count attribute hits for one attribute column, split by status.
#'
#' @param as Logical vector of attribute flags, or a one-column data frame
#'   holding such a vector (e.g. `Input[i]`).
#' @param id Vector of ids, same length as the attribute vector.
#' @param status Logical vector of status flags, same length as `id`.
#' @return A one-column data frame with three rows, in this order:
#'   number of rows with attribute TRUE and status TRUE, number of rows with
#'   attribute TRUE and status FALSE, and number of distinct ids among the
#'   latter rows.
calStats2 <- function(as, id, status) {
  # Unwrap a one-column data frame so the logical operators below work on a
  # plain vector.
  if (is.data.frame(as)) {
    as <- as[[1L]]
  }

  hit_status_true <- as & status
  hit_status_false <- as & !status

  # sum() with na.rm = TRUE counts only definite TRUEs; subsetting with
  # `x[x == TRUE]` would also keep NA entries and inflate the count.
  sTaT <- sum(hit_status_true, na.rm = TRUE)
  sFaT <- sum(hit_status_false, na.rm = TRUE)
  # which() drops NA positions before the ids are de-duplicated.
  sFaTuId <- length(unique(id[which(hit_status_false)]))

  data.frame(c(sTaT, sFaT, sFaTuId))
}
# Columns 1-2 are id and status; every remaining column is an attribute flag,
# so the loop adapts to however many attribute columns Input has.
attri_cols <- names(Input)[c(-1, -2)]

# One parallel task per attribute. `[[` passes the column as a plain logical
# vector; each task returns a one-column data frame and .combine binds them
# side by side.
result <- foreach(attri = attri_cols, .combine = data.frame) %dopar%
  calStats2(Input[[attri]], Input$id, Input$status)

# Label the columns by attribute, then transpose so each attribute is a row
# and each statistic is a column.
names(result) <- attri_cols
result <- t(result)
colnames(result) <- c("sTaT", "sFaT", "sFaTuId")

# Release the worker processes.
stopCluster(cl)