R:计算按列统计数据的高效/可扩展方法

时间:2017-02-01 01:46:00

标签: r dplyr

我需要使用以下信息计算列式统计数据:

  > library(dplyr)
  > # Toy input: an id column, a logical status column and two logical attribute columns.
  > # TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.
  > Input <- data_frame(id = c(1, 2, 2, 3, 3, 3),
  +                     status = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE),
  +                     attri1 = c(TRUE, TRUE, FALSE, FALSE, FALSE, FALSE),
  +                     attri2 = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE))
  > Input
  Source: local data frame [6 x 4]

       id status attri1 attri2
    (dbl)  (lgl)  (lgl)  (lgl)
  1     1   TRUE   TRUE   TRUE
  2     2   TRUE   TRUE   TRUE
  3     2   TRUE  FALSE   TRUE
  4     3  FALSE  FALSE   TRUE
  5     3  FALSE  FALSE   TRUE
  6     3  FALSE  FALSE  FALSE

使用以下步骤生成输出。基本上,sTaT表示status == T且相应的属性为T;sFaT表示status == F且相应的属性为T;sFaTuId基于sFaT,统计其中唯一id的数量。

  > # One result row per attribute column (every column except id and status).
  > # The third counter is created as sFaTuId so that it matches the column the loop fills.
  > Output <- data_frame(Attri = names(Input)[c(-1, -2)], sTaT = 0, sFaT = 0, sFaTuId = 0)
  > for (as in Output$Attri) {
         # Rows where the attribute named by `as` is TRUE; computed once per attribute.
         # NOTE(review): filter_() is deprecated in newer dplyr releases
         # (filter(.data[[as]]) is the replacement) - confirm against the installed version.
         attr_rows <- Input %>% filter_(as)
         # Of those rows, the ones whose status is FALSE.
         attr_rows_status_f <- attr_rows %>% filter(!status)

         # sTaT: attribute TRUE and status TRUE.
         sTaT <- attr_rows %>% filter(status) %>% nrow()
         # sFaT: attribute TRUE and status FALSE.
         sFaT <- attr_rows_status_f %>% nrow()
         # sFaTuId: distinct ids among the sFaT rows. Each pipe ends its line so
         # that the expression is not cut off before the next step.
         sFaTuId <- attr_rows_status_f %>%
             select(id) %>%
             unique() %>%
             nrow()

         # Store the three counts in the row that belongs to this attribute.
         Output$sTaT[Output$Attri == as] <- sTaT
         Output$sFaT[Output$Attri == as] <- sFaT
         Output$sFaTuId[Output$Attri == as] <- sFaTuId
         }

  > Output
  Source: local data frame [2 x 4]

     Attri  sTaT  sFaT sFaTuId
     (chr) (dbl) (dbl)   (dbl)
  1 attri1     2     0       0
  2 attri2     3     2       1

但是,当有许多行和属性列时,进程非常慢。有没有一种有效的方法来计算它?

2 个答案:

答案 0 :(得分:1)

我们可以通过将数据集转换为“长”格式(gather),按“Attri”分组并执行summarise来实现:
library(tidyr)
library(dplyr)
# Reshape the attribute columns into long form (one row per original row and
# attribute), then compute all three counts per attribute in one grouped pass.
# The first column is named sTaT to match the output requested in the question.
gather(Input, Attri, Val, attri1:attri2) %>%
  group_by(Attri) %>%
  summarise(sTaT = sum(status & Val),        # attribute TRUE and status TRUE
            sFaT = sum(!status & Val),       # attribute TRUE and status FALSE
            sFaTuId = n_distinct(id[!status & Val]))  # distinct ids among the sFaT rows
# A tibble: 2 × 4
#   Attri  sTaT  sFaT sFaTuId
#   <chr> <int> <int>   <int>
#1 attri1     2     0       0
#2 attri2     3     2       1

另一个选项是使用data.table中的melt:
library(data.table)
# Melt the attri* columns into long form and aggregate per attribute.
# NOTE: setDT() converts Input to a data.table by reference, so Input itself
# is changed by this call.
# measure.vars is spelled out in full rather than relying on partial argument
# matching of `measure`; the first column is named sTaT to match the output
# requested in the question.
melt(setDT(Input), measure.vars = patterns("^attri\\d+"),
     variable.name = "Attri")[
  , .(sTaT = sum(status & value),                  # attribute TRUE and status TRUE
      sFaT = sum(!status & value),                 # attribute TRUE and status FALSE
      sFaTuId = uniqueN(id[!status & value])),     # distinct ids among the sFaT rows
  by = .(Attri)]
#    Attri sTaT sFaT sFaTuId
#1: attri1    2    0       0
#2: attri2    3    2       1

答案 1 :(得分:0)

我发现doParallel是潜在的解决方案之一。

library(doParallel)
# Leave one core free, but never ask for fewer than one worker:
# detectCores() - 1 is 0 on a single-core machine and NA when the core count
# cannot be determined, and makeCluster() cannot start with either value.
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
# NOTE(review): type = "FORK" relies on process forking, which is not
# available on Windows - confirm the target platform.
cl <- makeCluster(no_cores, type = "FORK")
registerDoParallel(cl)
# Count how one logical attribute column lines up with `status`.
#
# Args:
#   as:     logical attribute vector. A one-column data frame is also accepted;
#           its single column is used.
#   id:     vector of ids, aligned element-wise with `as`.
#   status: logical status vector, aligned element-wise with `as`.
#
# Returns:
#   A one-column data.frame with three rows, in this order:
#     1. number of elements where the attribute and status are both TRUE
#     2. number of elements where the attribute is TRUE and status is FALSE
#     3. number of distinct ids among the elements counted in (2)
#   NA entries in `as` or `status` are never counted as matches.
calStats2 <- function (as, id, status){
  # Callers may pass a one-column data frame (e.g. Input[i]); use the column
  # itself so that `id[...]` below is indexed by a plain logical vector.
  if (is.data.frame(as)) {
    as <- as[[1L]]
  }

  attr_and_status <- as & status
  attr_not_status <- as & !status

  # sum() over a logical vector counts the TRUEs; na.rm keeps NAs out of the count.
  sTaT <- sum(attr_and_status, na.rm = TRUE)
  sFaT <- sum(attr_not_status, na.rm = TRUE)

  # which() drops NA positions, so NA never shows up as an extra "id".
  # The local name sTaTuId is kept only so that the auto-generated column name
  # of the returned data.frame stays the same; the value is the distinct-id
  # count for the status-FALSE rows.
  sTaTuId <- length(unique(id[which(attr_not_status)]))

  data.frame(c(sTaT, sFaT, sTaTuId))
}
# Attribute columns are every column except the first two (id and status);
# derived from Input instead of hard-coding 3:4.
attri_cols <- seq_along(Input)[c(-1, -2)]
# Run calStats2 (the function defined above) once per attribute column on the
# registered workers. Input[[i]] extracts the column as a vector; Input[i]
# would pass a one-column data frame instead.
result <- foreach(i = attri_cols, .combine = data.frame) %dopar%
  calStats2(Input[[i]], Input$id, Input$status)
names(result) <- names(Input)[attri_cols]
# Transpose so that each attribute is a row and each statistic a column.
result <- t(result)
# The third statistic counts distinct ids where status is FALSE, hence sFaTuId.
colnames(result) <- c("sTaT", "sFaT", "sFaTuId")
stopCluster(cl)