使用dplyr和for循环进行汇总

时间:2019-11-03 04:47:04

标签: r for-loop group-by dplyr summarize

我想在for循环中使用dplyr,针对目标变量对每个自变量(列)进行汇总。这是我的主要数据框:

  contract_ID    Asurion         Variable_1     Variable_2  Variable_3
         1          Y                a               c          f
         2          Y                a               d          g
         3          N                b               c          g
         4          N                a               d          f
         5          Y                b               c          f
         6          Y                a               d          f

分组之后,我得到

# Per Asurion group: number of distinct contracts for each value of the
# grouping column, plus that count as a percentage of the Asurion total.
a1 <- a %>%
  group_by(Asurion, BhvrBnk_Donates_to_Env_Causes) %>%
  summarise(counT = n_distinct(CONTRACT_ID)) %>%
  # The result is still grouped by Asurion, so sum(counT) is the group total.
  mutate(perc = paste0(round(counT / sum(counT) * 100, 2), "%"))

 Asurion Variable_1 CounT   perc
    Y         a        3     75%
    Y         b        1     25%
    N         a        1     50%
    N         b        1     50%

我想对存在于我的数据框中的每个变量进行这种汇总,并且我想使用for循环来实现。如何获得理想的结果?

这是我尝试使用的方法,但似乎不起作用。它用于学校项目,为此我需要使用for循环。请在这里帮助我

# Names of all columns in `a`; the loop below visits every one of them.
categorical <- colnames(a)
# Goal: for every column of `a`, group by Asurion and that column, count the
# distinct contracts and add a percentage, keeping each summary as its own
# data frame.
# NOTE(review): each summary is assigned back into `a[[i]]`, i.e. into the
# data frame that is being summarised, not into a separate object.
# NOTE(review): CONTRACT_ID is upper-case here while the table above shows
# contract_ID — confirm the real column name.
for (i in categorical) {
  a[[i]] <- a %>% 
     group_by(Asurion,get(i)) %>% 
    summarise(counT=n_distinct(CONTRACT_ID)) %>% 
    mutate(perc=paste0(round(counT/sum(counT)*100,2),"%"))
  }

3 个答案:

答案 0 :(得分:2)

您可能并不需要for loop来获得所需的东西。

# Example data: six contracts, the Asurion flag, and three categorical columns.
df <- data.frame(
  contract_ID = 1:6,
  Asurion     = c("Y", "Y", "N", "N", "Y", "Y"),
  Variable_1  = c("a", "a", "b", "a", "b", "a"),
  Variable_2  = c("c", "d", "c", "d", "c", "d"),
  Variable_3  = c("f", "g", "g", "f", "f", "f")
)

#' Distinct-contract counts and percentages for one column, per Asurion group.
#'
#' @param x Unquoted name of the column to summarise (e.g. `Variable_1`).
#' @param data Data frame containing `Asurion`, `contract_ID` and the column
#'   named by `x`. Defaults to the global `df`, so existing `pct(col)` calls
#'   behave exactly as before.
#' @return A tibble grouped by `Asurion` with one row per Asurion x value,
#'   the distinct-contract count `counT`, and `perc`, that count as a
#'   percentage string of the Asurion total.
pct <- function(x, data = df) {
  data %>%
    group_by(Asurion, {{ x }}) %>%
    summarise(counT = n_distinct(contract_ID)) %>%
    # Still grouped by Asurion here, so sum(counT) is the per-Asurion total.
    mutate(perc = paste0(round(counT / sum(counT) * 100, 2), "%"))
}

pct(Variable_1)
pct(Variable_2)
pct(Variable_3)

如果确实有很多变量,则可以使用诸如`for`循环或`apply`之类的方法来迭代最后这一步。这是一个选项:

# Keep only columns 3 to 5 of df (Variable_1, Variable_2, Variable_3).
categorical<- df[3:5]
# Collects one summary tibble per column, filled in by position.
a <- list()
# Manual position counter for `a`.
j = 1
# Looping over a data frame visits its columns, so `i` holds a column's
# values here, not its name.
for (i in categorical) {
  a[[j]] <- df %>% 
    # NOTE(review): in the printed result below the second grouping column is
    # labelled `<fct>` instead of the variable's name — presumably because
    # `{{i}}` injects the column's values rather than a column name.
    group_by(Asurion, {{i}}) %>% 
    summarise(counT=n_distinct(contract_ID)) %>% 
    # Percentage of each count within its Asurion group.
    mutate(perc = paste0(round(counT/sum(counT)*100,2),"%"))
  j = j + 1
}
# Print the list of summaries.
a
[[1]]
# A tibble: 4 x 4
# Groups:   Asurion [2]
  Asurion `<fct>` counT perc 
  <fct>   <fct>   <int> <chr>
1 N       a           1 50%  
2 N       b           1 50%  
3 Y       a           3 75%  
4 Y       b           1 25%  

[[2]]
# A tibble: 4 x 4
# Groups:   Asurion [2]
  Asurion `<fct>` counT perc 
  <fct>   <fct>   <int> <chr>
1 N       c           1 50%  
2 N       d           1 50%  
3 Y       c           2 50%  
4 Y       d           2 50%  

[[3]]
# A tibble: 4 x 4
# Groups:   Asurion [2]
  Asurion `<fct>` counT perc 
  <fct>   <fct>   <int> <chr>
1 N       f           1 50%  
2 N       g           1 50%  
3 Y       f           3 75%  
4 Y       g           1 25%  

编辑:根据您的问题,把变量名称作为新的一列加入每个结果,以便识别所用的group_by变量。

# Summarise every Variable_* column against Asurion, one tibble per column.
categorical <- df[3:5]
vnames <- colnames(categorical)

# Preallocate one slot per variable and name it, so each summary can be
# retrieved as a[["Variable_1"]] and so on.
a <- setNames(vector("list", length(vnames)), vnames)

# Loop over column *names*: grouping by the named column keeps its real name
# in the result instead of an anonymous `<fct>` label.
for (j in seq_along(vnames)) {
  a[[j]] <- df %>%
    group_by(Asurion, !!as.name(vnames[j])) %>%
    summarise(counT = n_distinct(contract_ID)) %>%
    # Still grouped by Asurion here, so sum(counT) is the per-Asurion total.
    mutate(perc = paste0(round(counT / sum(counT) * 100, 2), "%"))
  # Record which variable this summary belongs to.
  a[[j]]$vnames <- vnames[j]
}
a

答案 1 :(得分:1)

Base R解决方案:

 # Base R: one summary data frame per categorical column. Each gives, for
 # every Asurion x value combination, the number of distinct contracts (CounT)
 # and that number as a percentage of the contracts in the same Asurion group.

 # Identifier columns; every other column is treated as a categorical variable.
 id_cols <- c("contract_ID", "Asurion")
 var_cols <- names(df)[!(names(df) %in% id_cols)]

 # Counts distinct values; used for both the numerator and the denominator.
 n_unique <- function(x) length(unique(x))

 # Stack the categorical columns: one row per contract and variable, with the
 # variable's name in `Variable` and its value in `Val`.
 df2 <- data.frame(
   reshape(df,
           direction = "long",
           varying = var_cols,
           v.names = "Val",
           timevar = "Variable",
           times = var_cols),
   row.names = NULL,
   stringsAsFactors = FALSE
 )

 # Count the unique contract ids within each Asurion x Variable x Val group:
 df2$CounT <- as.numeric(ave(df2$contract_ID,
                             paste(df2$Asurion, df2$Variable, df2$Val, sep = "_"),
                             FUN = n_unique))

 # Percentage of the Asurion group: the denominator is the number of distinct
 # contracts per Asurion x Variable, so the percentages add up to 100% within
 # each Asurion level (e.g. Y/a = 75%, Y/b = 25% for Variable_1).
 group_total <- as.numeric(ave(df2$contract_ID,
                               paste(df2$Asurion, df2$Variable, sep = "_"),
                               FUN = n_unique))
 df2$perc <- paste0(round((df2$CounT / group_total) * 100, 2), "%")

 # One de-duplicated summary per variable; `id` (the row id added by
 # reshape()) and `contract_ID` are dropped so identical rows collapse.
 df_list <- lapply(split(df2, df2$Variable),
                   function(x) unique(x[, !(names(x) %in% c("id", "contract_ID"))]))

 # Push the dataframes from the list into the global environment:
 list2env(df_list, .GlobalEnv)

Tidyverse解决方案:

# library() stops with an error when the package is missing; require() would
# only return FALSE and let the script fail later.
library(tidyverse)

# Tidyverse summary: stack the categorical columns, count distinct contracts
# per Asurion x Variable x Value, and express each count as a percentage of
# the contracts in the same Asurion x Variable group.
df_list <-
  df %>%
  pivot_longer(cols = -c(contract_ID, Asurion),
               names_to = "Variable",
               values_to = "Value") %>%
  group_by(Asurion, Variable, Value) %>%
  summarise(CounT = n_distinct(contract_ID)) %>%
  # summarise() drops the last grouping level, so the data is grouped by
  # Asurion x Variable here and sum(CounT) is that group's total; percentages
  # therefore add up to 100% within each Asurion level.
  mutate(perc = paste0(round((CounT / sum(CounT)) * 100, 2), "%")) %>%
  ungroup() %>%
  # One data frame per variable, named after the variable.
  split(., .$Variable)

# Push the dataframes from the list into the global environment:
list2env(df_list, .GlobalEnv)

数据:

# Example data as an R object literal (looks like dput() output — TODO
# confirm); assign it, e.g. `df <- structure(...)`, to recreate the data frame.
structure(list(contract_ID = 1:6, Asurion = c("Y", "Y", "N", 
"N", "Y", "Y"), Variable_1 = c("a", "a", "b", "a", "b", "a"), 
    Variable_2 = c("c", "d", "c", "d", "c", "d"), Variable_3 = c("f", 
    "g", "g", "f", "f", "f")), class = "data.frame", row.names = c(NA, 
-6L))

答案 2 :(得分:1)

这是一种使用tidyr和dplyr的方式,可根据您的问题生成结果列表:

library(tidyr)
library(dplyr)

# For each column whose name starts with "Variable": count rows per
# Asurion x value and express the count as a share of all rows with the same
# Asurion. Produces a list with one tibble per variable.
DF %>%
  # Attach the number of rows per Asurion to every row (the denominator).
  add_count(Asurion, name = "all_n") %>%
  # Long format: pivot_longer()'s default output columns `name` and `value`.
  pivot_longer(cols = starts_with("Variable")) %>%
  group_by(Asurion, name, value) %>%
  summarize(CounT = n(),
            perc = n() / first(all_n)) %>%
  ungroup() %>%
  # One list element per variable; keep = FALSE leaves the `name` column out
  # of each piece (see the printed result below).
  group_split(name, keep = FALSE)

[[1]]
# A tibble: 4 x 4
  Asurion value CounT  perc
  <fct>   <fct> <int> <dbl>
1 N       a         1  0.5 
2 N       b         1  0.5 
3 Y       a         3  0.75
4 Y       b         1  0.25

[[2]]
# A tibble: 4 x 4
  Asurion value CounT  perc
  <fct>   <fct> <int> <dbl>
1 N       c         1   0.5
2 N       d         1   0.5
3 Y       c         2   0.5
4 Y       d         2   0.5

[[3]]
# A tibble: 4 x 4
  Asurion value CounT  perc
  <fct>   <fct> <int> <dbl>
1 N       f         1  0.5 
2 N       g         1  0.5 
3 Y       f         3  0.75
4 Y       g         1  0.25

还有一个更好地匹配预期输出的base R解决方案:

## base
# For every column of DF whose name contains "Variable", cross-tabulate it
# against Asurion and return the within-Asurion proportions (Freq) next to
# the raw counts (CounT), one data frame per column.
lapply(grep("Variable", names(DF), value = TRUE),
       function(col_name) {
         # Named `counts` rather than `t` so base::t() is not shadowed.
         counts <- table(DF[, c("Asurion", col_name)])
         # Margin 1 = proportions within each row of the table, i.e. within
         # each Asurion level; c(counts) flattens the counts in the same
         # cell order that data.frame() uses for the table.
         data.frame(prop.table(counts, 1), CounT = c(counts))
       }
)

[[1]]
  Asurion Variable_1 Freq CounT
1       N          a 0.50     1
2       Y          a 0.75     3
3       N          b 0.50     1
4       Y          b 0.25     1

[[2]]
  Asurion Variable_2 Freq CounT
1       N          c  0.5     1
2       Y          c  0.5     2
3       N          d  0.5     1
4       Y          d  0.5     2

[[3]]
  Asurion Variable_3 Freq CounT
1       N          f 0.50     1
2       Y          f 0.75     3
3       N          g 0.50     1
4       Y          g 0.25     1

数据(来自@Zhiqiang Wang):

# Example data: six contracts, the Asurion flag, and three categorical columns.
DF <- data.frame(
  contract_ID = 1:6,
  Asurion     = c("Y", "Y", "N", "N", "Y", "Y"),
  Variable_1  = c("a", "a", "b", "a", "b", "a"),
  Variable_2  = c("c", "d", "c", "d", "c", "d"),
  Variable_3  = c("f", "g", "g", "f", "f", "f")
)