Question

下面是随机数据。

drop    drop1   drop2   ch
15  14  40  1
20  15  45  1
35  16  90  1
40  17  70  0
25  18  80  0
30  18  90  0
11  20  100 0
13  36  11  0
16  70  220 0
19  40  440 1
25  45  1   1
35  30  70  1
40  40  230 1
17  11  170 1
30  2   160 1

我正在使用以下代码在R中对连续变量进行双变量分析。

library(dplyr)
# Split `drop` into two equal-sized bins (dec = 1 or 2). The binning is
# computed once and reused by every summary below.
binned <- dt %>%
  mutate(dec = ntile(drop, n = 2))

# Number of rows with ch == 1 in each bin.
datcbld <- binned %>%
  count(ch, dec) %>%
  filter(ch == 1)

# Total number of rows in each bin, and the share of them with ch == 1.
# NOTE: the columns below are attached by position, so this assumes every
# bin contains at least one ch == 1 row.
datcbld$N <- count(binned, dec)$n
datcbld$ch_perc <- datcbld$n / datcbld$N

# Smallest and largest `drop` value within each bin.
bin_range <- binned %>%
  group_by(dec) %>%
  summarise(lo = min(drop), hi = max(drop))
datcbld$GreaterThan <- bin_range$lo
datcbld$LessThan <- bin_range$hi

# Name of the variable that was binned. This used to be "dt" (the data
# frame's name), which did not match the `drop` shown in the output below.
datcbld$Varname <- rep("drop", nrow(datcbld))

下面是代码的输出。

ch  dec n   N   ch_perc GreaterThan LessThan    Varname
1   1   4   8   0.5 11  25  drop
1   2   5   7   0.714285714 25  40  drop

当我将其用于单个变量时，此代码可以很好地工作。

当我尝试使用for循环对每一列运行它时，无法正确汇总每个分位组（dec）的min和max。

下面是我的代码，用于运行for循环。

# Attempt to repeat the single-variable summary for every column of `dt`
# except the last one (`ch`), appending each per-column result to
# `finaldata`.
finaldata <- data.frame()

for(i in 1:(ncol(dt) - 1)){
  # Rows with ch == 1 per bin of the i-th column.
  dt %>% 
    mutate(dec=ntile(dt[, colnames(dt[i])], n = 2)) %>%
    count(ch,dec) %>%
    filter(ch == 1) -> dat
  # Total number of rows per bin (second column of the count table).
  dat$N <- unclass(dt %>% 
                     mutate(dec=ntile(dt[, colnames(dt[i])], n=2)) %>%
                     count(dec) %>%
                     unname())[[2]]
  dat$ch_perc <- dat$n / dat$N
  # NOTE: `dt[, colnames(dt[i])]` is the full column taken from `dt`, not
  # the rows of the current group, so min() here returns the same overall
  # minimum for every `dec` instead of a per-bin minimum.
  dat$GreaterThan <- unclass(dt %>% 
                               mutate(dec=ntile(dt[, colnames(dt[i])], n=2)) %>%
                               group_by(dec) %>%
                               summarise(min(dt[, colnames(dt[i])])))[[2]]
  # NOTE: same problem as above: max() is taken over the whole column.
  dat$LessThan <- unclass(dt %>%
                            mutate(dec=ntile(dt[, colnames(dt[i])], n=2)) %>%
                            group_by(dec) %>% 
                            summarise(max(dt[, colnames(dt[i])])))[[2]]
  dat$Varname <- rep(colnames(dt[i]), nrow(dat))
  finaldata <- rbind(finaldata, dat)
}

但是我无法获得相同的结果。

Answer 1

我们可以用map遍历列名来实现这一点，而无需打断管道链（%>%）

library(tidyverse)
# For every variable except `ch`: bin the variable into two equal-sized
# groups, compute per-bin statistics, and keep the ch == 1 rows.
# map_df() row-binds the per-variable results, so the pipe is never broken.
setdiff(names(dt), "ch") %>%
  map_df(~
    dt %>%
      select(.x, ch) %>%
      mutate(dec = ntile(!!rlang::sym(.x), n = 2)) %>%
      group_by(dec) %>%
      # Evaluated per bin because the column is referenced as a symbol:
      # GreaterThan is the bin's lower bound (min), LessThan its upper
      # bound (max), matching the definitions used in the question.
      mutate(N = n(),
             GreaterThan = min(!!rlang::sym(.x)),
             LessThan = max(!!rlang::sym(.x))) %>%
      # Drop the variable itself (first column) so that count() groups
      # only by ch, dec and the per-bin statistics.
      select(-1) %>%
      count(!!!rlang::syms(names(.))) %>%
      filter(ch == 1) %>%
      mutate(ch_perc = n / N,
             Varname = .x))
# A tibble: 6 x 8
# Groups:   dec [2]
#    dec    ch     N GreaterThan LessThan     n ch_perc Varname  
#  <int> <int> <int>       <dbl>    <dbl> <int>   <dbl> <chr>
#1     1     1     8          11       25     4   0.5   drop 
#2     2     1     7          25       40     5   0.714 drop 
#3     1     1     8           2       18     5   0.625 drop1
#4     2     1     7          20       70     4   0.571 drop1
#5     1     1     8           1       90     5   0.625 drop2
#6     2     1     7          90      440     4   0.571 drop2

OP的for循环中的问题是使用

dt[, colnames(dt[i])]

在summarise之内。这样min或max会作用于整列的所有值，而不是分别作用于group_by分组后每一组内的值

我们可以像上面那样把列名转换为符号（sym）并对其求值，或者使用summarise_at

# Build one summary table per variable. `ch` is the last column of `dt`
# and is skipped. Results are collected in a preallocated list and bound
# once at the end instead of growing a data frame inside the loop.
pieces <- vector("list", ncol(dt) - 1)
for (i in seq_along(pieces)) {
  var_name <- names(dt)[i]
  # Turn the column name into a symbol so that, once unquoted with `!!`,
  # dplyr evaluates it within each group rather than on the whole column.
  var_sym <- rlang::sym(var_name)

  # Two equal-sized bins of the current variable, computed once.
  binned <- dt %>%
    mutate(dec = ntile(!!var_sym, n = 2))

  # Rows with ch == 1 per bin.
  dat <- binned %>%
    count(ch, dec) %>%
    filter(ch == 1)

  # Total rows per bin and the share with ch == 1 (attached by position,
  # which assumes every bin has at least one ch == 1 row).
  dat$N <- count(binned, dec)$n
  dat$ch_perc <- dat$n / dat$N

  # Per-bin lower bound (GreaterThan = min) and upper bound
  # (LessThan = max), matching the definitions used in the question.
  bin_range <- binned %>%
    group_by(dec) %>%
    summarise(lo = min(!!var_sym), hi = max(!!var_sym))
  dat$GreaterThan <- bin_range$lo
  dat$LessThan <- bin_range$hi

  dat$Varname <- rep(var_name, nrow(dat))
  pieces[[i]] <- dat
}
finaldata <- bind_rows(pieces)

finaldata
# A tibble: 6 x 8
#     ch   dec     n     N ch_perc GreaterThan LessThan Varname
#  <int> <int> <int> <int>   <dbl>       <dbl>    <dbl> <chr>  
#1     1     1     4     8   0.5            11       25 drop   
#2     1     2     5     7   0.714          25       40 drop   
#3     1     1     5     8   0.625           2       18 drop1  
#4     1     2     4     7   0.571          20       70 drop1  
#5     1     1     5     8   0.625           1       90 drop2  
#6     1     2     4     7   0.571          90      440 drop2

数据

# Example data: 15 rows with three integer variables (drop, drop1, drop2)
# and a 0/1 indicator column `ch`.
dt <- data.frame(
  drop = c(15L, 20L, 35L, 40L, 25L, 30L, 11L, 13L, 16L, 19L, 25L, 35L,
           40L, 17L, 30L),
  drop1 = c(14L, 15L, 16L, 17L, 18L, 18L, 20L, 36L, 70L, 40L, 45L, 30L,
            40L, 11L, 2L),
  drop2 = c(40L, 45L, 90L, 70L, 80L, 90L, 100L, 11L, 220L, 440L, 1L, 70L,
            230L, 170L, 160L),
  ch = c(1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L)
)

使用for循环时无法汇总最小值和最大值

1 个答案:

数据