下面是随机数据。
drop drop1 drop2 ch
15 14 40 1
20 15 45 1
35 16 90 1
40 17 70 0
25 18 80 0
30 18 90 0
11 20 100 0
13 36 11 0
16 70 220 0
19 40 440 1
25 45 1 1
35 30 70 1
40 40 230 1
17 11 170 1
30 2 160 1
我正在使用以下代码对R中的连续变量进行变量分析。
library(dplyr)
# Split `drop` into two quantile bins once, so that every statistic below is
# based on exactly the same `dec` assignment.
binned <- dt %>%
  mutate(dec = ntile(drop, n = 2))

# Per-bin totals and range of `drop`, computed over all rows (any `ch`):
#   N           - number of rows in the bin
#   GreaterThan - smallest `drop` value in the bin
#   LessThan    - largest `drop` value in the bin
bin_stats <- binned %>%
  group_by(dec) %>%
  summarise(N = n(),
            GreaterThan = min(drop),
            LessThan = max(drop))

# Count the ch == 1 rows per bin and attach the per-bin statistics.
# Joining on `dec` (rather than assigning columns by position) keeps the rows
# aligned even when a bin contains no ch == 1 observations.
datcbld <- binned %>%
  count(ch, dec) %>%
  filter(ch == 1) %>%
  left_join(bin_stats, by = "dec") %>%
  mutate(ch_perc = n / N,
         # Label rows with the analysed column, as in the printed output.
         Varname = "drop") %>%
  select(ch, dec, n, N, ch_perc, GreaterThan, LessThan, Varname)
下面是代码的输出。
ch dec n N ch_perc GreaterThan LessThan Varname
1 1 4 8 0.5 11 25 drop
1 2 5 7 0.714285714 25 40 drop
当我将其用于单个变量时,此代码可以很好地工作。
当我尝试使用for循环为每列运行它时,无法使用每个十分位数的min和max进行汇总。
下面是我的代码,用于运行for循环。
# Attempt to repeat the single-variable analysis for every column of `dt`
# except the last one, stacking the per-column results into `finaldata`.
# NOTE: this is the version that does NOT produce the expected per-bin
# min/max; see the notes on GreaterThan / LessThan below.
finaldata <- data.frame()
for(i in 1:(ncol(dt) - 1)){
# Bin column i into two groups and count the ch == 1 rows in each bin.
dt %>%
mutate(dec=ntile(dt[, colnames(dt[i])], n = 2)) %>%
count(ch,dec) %>%
filter(ch == 1) -> dat
# Total number of rows per bin (all values of ch), taken by position.
dat$N <- unclass(dt %>%
mutate(dec=ntile(dt[, colnames(dt[i])], n=2)) %>%
count(dec) %>%
unname())[[2]]
# Share of ch == 1 rows within each bin.
dat$ch_perc <- dat$n / dat$N
# NOTE: inside summarise(), `dt[, colnames(dt[i])]` refers to the whole
# column of `dt`, not to the rows of the current `dec` group, so min() is
# taken over the entire column instead of per bin.
dat$GreaterThan <- unclass(dt %>%
mutate(dec=ntile(dt[, colnames(dt[i])], n=2)) %>%
group_by(dec) %>%
summarise(min(dt[, colnames(dt[i])])))[[2]]
# NOTE: same problem as above, max() is taken over the entire column.
dat$LessThan <- unclass(dt %>%
mutate(dec=ntile(dt[, colnames(dt[i])], n=2)) %>%
group_by(dec) %>%
summarise(max(dt[, colnames(dt[i])])))[[2]]
# Label the rows with the name of the analysed column and append them.
dat$Varname <- rep(colnames(dt[i]), nrow(dat))
finaldata <- rbind(finaldata, dat)
}
但是我无法获得相同的结果。
答案 0 :(得分:1)
我们可以遍历列名并使用 `map` 来做到这一点,而无需断开管道链(`%>%`)。
library(tidyverse)
# For each of the first three columns of `dt`, split the values into two
# quantile bins and report, for the ch == 1 rows of every bin:
#   N           - number of rows in the bin (all values of ch)
#   GreaterThan - smallest value in the bin (lower bound)
#   LessThan    - largest value in the bin (upper bound)
#   n           - number of ch == 1 rows in the bin
#   ch_perc     - n / N
#   Varname     - name of the column that was binned
names(dt)[1:3] %>%
  map_df(~
    dt %>%
      select(.x, ch) %>%
      # The column name arrives as a string; turn it into a symbol and
      # unquote it so dplyr evaluates it as a column of the data.
      mutate(dec = ntile(!!rlang::sym(.x), n = 2)) %>%
      group_by(dec) %>%
      # Grouped mutate: the statistics are computed per `dec` bin.
      # GreaterThan is the bin minimum and LessThan the bin maximum, matching
      # the definition used in the question.
      mutate(N = n(),
             GreaterThan = min(!!rlang::sym(.x)),
             LessThan = max(!!rlang::sym(.x))) %>%
      # Drop the raw value column (first column after the select above).
      select(-1) %>%
      # Count rows for every combination of the remaining columns.
      count(!!!rlang::syms(names(.))) %>%
      filter(ch == 1) %>%
      mutate(ch_perc = n / N,
             Varname = .x))
# A tibble: 6 x 8
# Groups: dec [2]
# dec ch N GreaterThan LessThan n ch_perc Varname
# <int> <int> <int> <dbl> <dbl> <int> <dbl> <chr>
#1 1 1 8 11 25 4 0.5 drop
#2 2 1 7 25 40 5 0.714 drop
#3 1 1 8 2 18 5 0.625 drop1
#4 2 1 7 20 70 4 0.571 drop1
#5 1 1 8 1 90 5 0.625 drop2
#6 2 1 7 90 440 4 0.571 drop2
OP 的 `for` 循环中的问题在于在 `summarise` 内部使用了 `dt[, colnames(dt[i])]`。它会对整列的值应用 `min` 或 `max`,而不是对分组后的每一组分别应用函数。
我们可以像上面那样把列名转换为符号(`sym`)并求值,或者使用 `summarise_at`。
# Loop version of the analysis: process every column of `dt` except the last
# one and stack the per-column results into `finaldata`.
col_names <- head(names(dt), -1)
results <- vector("list", length(col_names))
for (i in seq_along(col_names)) {
  # Turn the column name into a symbol so that, once unquoted with `!!`,
  # dplyr evaluates it per group instead of over the whole column.
  col_sym <- rlang::sym(col_names[i])

  # Bin the column once and reuse the binned data for every statistic.
  binned <- dt %>%
    mutate(dec = ntile(!!col_sym, n = 2))

  # Per-bin size and range, computed over all rows (any value of ch).
  # GreaterThan is the bin minimum and LessThan the bin maximum, matching
  # the definition used in the question.
  bin_stats <- binned %>%
    group_by(dec) %>%
    summarise(N = n(),
              GreaterThan = min(!!col_sym),
              LessThan = max(!!col_sym))

  # Joining on `dec` (instead of assigning columns by position) keeps rows
  # aligned even when a bin contains no ch == 1 observations.
  results[[i]] <- binned %>%
    count(ch, dec) %>%
    filter(ch == 1) %>%
    left_join(bin_stats, by = "dec") %>%
    mutate(ch_perc = n / N,
           Varname = col_names[i]) %>%
    select(ch, dec, n, N, ch_perc, GreaterThan, LessThan, Varname)
}
# Bind once at the end rather than growing a data frame inside the loop.
finaldata <- bind_rows(results)
finaldata
# A tibble: 6 x 8
# ch dec n N ch_perc GreaterThan LessThan Varname
# <int> <int> <int> <int> <dbl> <dbl> <dbl> <chr>
#1 1 1 4 8 0.5 11 25 drop
#2 1 2 5 7 0.714 25 40 drop
#3 1 1 5 8 0.625 2 18 drop1
#4 1 2 4 7 0.571 20 70 drop1
#5 1 1 5 8 0.625 1 90 drop2
#6 1 2 4 7 0.571 90 440 drop2
# Example data used throughout: 15 rows with three integer columns
# (`drop`, `drop1`, `drop2`) and a 0/1 integer column `ch`.
dt <- data.frame(
  drop = c(
    15L, 20L, 35L, 40L, 25L, 30L, 11L, 13L,
    16L, 19L, 25L, 35L, 40L, 17L, 30L
  ),
  drop1 = c(
    14L, 15L, 16L, 17L, 18L, 18L, 20L, 36L,
    70L, 40L, 45L, 30L, 40L, 11L, 2L
  ),
  drop2 = c(
    40L, 45L, 90L, 70L, 80L, 90L, 100L, 11L,
    220L, 440L, 1L, 70L, 230L, 170L, 160L
  ),
  ch = c(
    1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L,
    0L, 1L, 1L, 1L, 1L, 1L, 1L
  )
)