说
# Example data: 26 integer scores (x1) with a sex factor (group).
# Rows come in three consecutive runs: 11 males, 7 females, 8 males.
data <- data.frame(
  x1 = c(
    88L, 88L, 94L, 82L, 68L, 72L, 43L, 84L, 65L, 91L, 65L, # first male run
    80L, 82L, 63L, 67L, 58L, 100L, 32L,                    # female run
    75L, 66L, 30L, 12L, 97L, 58L, 14L, 64L                 # second male run
  ),
  group = factor(
    rep(c("male", "female", "male"), times = c(11L, 7L, 8L)),
    levels = c("female", "male")
  )
)
此数据中有一个分组变量(性别:男性和女性)。我需要对排在女性之前的所有男性计算平均值和 25% 分位数。排在女性之后的男性保持不变,女性也保持不变。因此期望的输出为:
x1 group mean 25%
88 male 76,36 66,5
88 male 76,36 66,5
94 male 76,36 66,5
82 male 76,36 66,5
68 male 76,36 66,5
72 male 76,36 66,5
43 male 76,36 66,5
84 male 76,36 66,5
65 male 76,36 66,5
91 male 76,36 66,5
65 male 76,36 66,5
80 female
82 female
63 female
67 female
58 female
100 female
32 female
75 male
66 male
30 male
12 male
97 male
58 male
14 male
64 male
如何做到?
x1 group
88 male
88 male
94 male
82 male
68 male
72 male
43 male
84 male
65 male
91 male
65 male
80 female
82 female
63 female
67 female
58 female
100 female
32 female
**76,36 male
**76,36 male
30 male
12 male
**76,36 male
58 male
14 male
64 male
这里结果。
答案 0 :(得分:4)
在 data.table 中,可以只更新 `rleid(group) == 1` 的行,即按 `group` 的取值划分出的连续分段中的第一段。
library(data.table)
# Convert the question's data.frame (`data`) to a data.table by reference.
setDT(data)
# rleid(group) numbers the runs of consecutive identical `group` values, so
# run 1 is the leading block of males. Only those rows are assigned the
# statistics; every other row is left as NA in the new columns.
data[rleid(group) == 1L, `:=`(mean = mean(x1), Q25 = quantile(x1, 0.25))]
结果
# x1 group mean Q25
# 1: 88 male 76.36364 66.5
# 2: 88 male 76.36364 66.5
# 3: 94 male 76.36364 66.5
# 4: 82 male 76.36364 66.5
# 5: 68 male 76.36364 66.5
# 6: 72 male 76.36364 66.5
# 7: 43 male 76.36364 66.5
# 8: 84 male 76.36364 66.5
# 9: 65 male 76.36364 66.5
# 10: 91 male 76.36364 66.5
# 11: 65 male 76.36364 66.5
# 12: 80 female NA NA
# 13: 82 female NA NA
# 14: 63 female NA NA
# 15: 67 female NA NA
# 16: 58 female NA NA
# 17: 100 female NA NA
# 18: 32 female NA NA
# 19: 75 male NA NA
# 20: 66 male NA NA
# 21: 30 male NA NA
# 22: 12 male NA NA
# 23: 97 male NA NA
# 24: 58 male NA NA
# 25: 14 male NA NA
# 26: 64 male NA NA
# x1 group mean Q25
答案 1 :(得分:2)
library(dplyr)
library(data.table)

data %>%
  # rleid() numbers the runs of consecutive equal `group` values, so the two
  # male runs fall into different groups.
  group_by(group, group2 = rleid(group)) %>%
  mutate(
    # Only the first male run has matching rows; in every other group the
    # subset is empty, giving NaN for the mean and NA for the quantile.
    MEAN = mean(x1[group2 == 1 & group == "male"]),
    Q25 = quantile(x1[group2 == 1 & group == "male"], probs = 0.25)
  ) %>%
  ungroup() %>%
  select(-group2) %>% # drop the helper run index
  data.frame() # only for visualisation purposes
# x1 group MEAN Q25
# 1 88 male 76.36364 66.5
# 2 88 male 76.36364 66.5
# 3 94 male 76.36364 66.5
# 4 82 male 76.36364 66.5
# 5 68 male 76.36364 66.5
# 6 72 male 76.36364 66.5
# 7 43 male 76.36364 66.5
# 8 84 male 76.36364 66.5
# 9 65 male 76.36364 66.5
# 10 91 male 76.36364 66.5
# 11 65 male 76.36364 66.5
# 12 80 female NaN NA
# 13 82 female NaN NA
# 14 63 female NaN NA
# 15 67 female NaN NA
# 16 58 female NaN NA
# 17 100 female NaN NA
# 18 32 female NaN NA
# 19 75 male NaN NA
# 20 66 male NaN NA
# 21 30 male NaN NA
# 22 12 male NaN NA
# 23 97 male NaN NA
# 24 58 male NaN NA
# 25 14 male NaN NA
# 26 64 male NaN NA
要根据您提到的逻辑更新x1
列,可以使用以下方法:
# Compute MEAN/Q25 for the first male run, then overwrite x1 in the second
# male run (group2 == 3) wherever x1 exceeds that 25% quantile.
data %>%
  group_by(group, group2 = rleid(group)) %>%
  mutate(
    # Non-missing only for the first male run (group2 == 1).
    MEAN = mean(x1[group == "male" & group2 == 1]),
    Q25 = quantile(x1[group == "male" & group2 == 1], 0.25)
  ) %>%
  ungroup() %>%
  # Once ungrouped, each of MEAN and Q25 holds one distinct non-missing
  # value; it is used as the replacement value and the threshold.
  mutate(x1 = ifelse(
    group == "male" & group2 == 3 & x1 > unique(Q25[!is.na(Q25)]),
    unique(MEAN[!is.na(MEAN)]),
    x1
  )) %>%
  select(-group2) %>%
  data.frame()
# x1 group MEAN Q25
# 1 88.00000 male 76.36364 66.5
# 2 88.00000 male 76.36364 66.5
# 3 94.00000 male 76.36364 66.5
# 4 82.00000 male 76.36364 66.5
# 5 68.00000 male 76.36364 66.5
# 6 72.00000 male 76.36364 66.5
# 7 43.00000 male 76.36364 66.5
# 8 84.00000 male 76.36364 66.5
# 9 65.00000 male 76.36364 66.5
# 10 91.00000 male 76.36364 66.5
# 11 65.00000 male 76.36364 66.5
# 12 80.00000 female NaN NA
# 13 82.00000 female NaN NA
# 14 63.00000 female NaN NA
# 15 67.00000 female NaN NA
# 16 58.00000 female NaN NA
# 17 100.00000 female NaN NA
# 18 32.00000 female NaN NA
# 19 76.36364 male NaN NA
# 20 66.00000 male NaN NA
# 21 30.00000 male NaN NA
# 22 12.00000 male NaN NA
# 23 76.36364 male NaN NA
# 24 58.00000 male NaN NA
# 25 14.00000 male NaN NA
# 26 64.00000 male NaN NA
我添加的额外代码(`mutate`)仅针对排在女性之后的男性(`group2 == 3`)更新 `x1`,并且仅当 `x1` 大于分位数时才更新。
答案 2 :(得分:0)
这是另一种dplyr
方法,它按rleid()
组进行汇总,并使用left_join()
附加结果列:
library(dplyr)

result <- data %>%
  # Group by run of consecutive equal `group` values (run 1 = first males).
  group_by(rleid = data.table::rleid(group)) %>%
  left_join(
    .,
    # One-row summary of the first run; after the join the statistics are
    # NA for the rows of every other run.
    filter(., rleid == 1) %>%
      summarise(mean = mean(x1), q25 = quantile(x1, 0.25)),
    # State the key explicitly instead of relying on the natural join.
    by = "rleid"
  ) %>%
  ungroup() %>%
  select(-rleid)

result %>%
  print(n = Inf) # make sure to print all rows
# A tibble: 26 x 4 x1 group mean q25 <int> <fct> <dbl> <dbl> 1 88 male 76.4 66.5 2 88 male 76.4 66.5 3 94 male 76.4 66.5 4 82 male 76.4 66.5 5 68 male 76.4 66.5 6 72 male 76.4 66.5 7 43 male 76.4 66.5 8 84 male 76.4 66.5 9 65 male 76.4 66.5 10 91 male 76.4 66.5 11 65 male 76.4 66.5 12 80 female NA NA 13 82 female NA NA 14 63 female NA NA 15 67 female NA NA 16 58 female NA NA 17 100 female NA NA 18 32 female NA NA 19 75 male NA NA 20 66 male NA NA 21 30 male NA NA 22 12 male NA NA 23 97 male NA NA 24 58 male NA NA 25 14 male NA NA 26 64 male NA NA
请注意,除非结果被分配回data
,否则data
不会被修改。
答案 3 :(得分:0)
这也是另一种data.table
方法,可以回答OP的原始问题以及OP在评论here和here中提出的其他问题。
对于这两个问题,我们都需要先计算第一组男性的汇总统计量,然后通过引用更新 `data`:第一个问题使用 update join,第二个问题使用 update non-equi join。
library(data.table)

# Coerce to data.table in place, then tag every row with the id of its run
# of consecutive identical `group` values; the id is the key for later joins.
setDT(data)
data[, rleid := rleid(group)]
# Make x1 double so that it has the same type as mean(x1).
data[, x1 := as.double(x1)]

# Aggregates of the first run only. The unnamed results get the default
# column names V1 (mean) and V2 (25% quantile).
agg <- data[rleid == 1, .(mean(x1), quantile(x1, .25)), by = rleid]
agg
   rleid       V1   V2
1:     1 76.36364 66.5
这是通过更新联接来实现的
# Update join: rows of data whose rleid matches a row of agg receive that
# row's V1/V2 as new columns mean/q25; unmatched rows get NA.
data[agg, on = "rleid", `:=`(mean = i.V1, q25 = i.V2)]
# Print the table that was modified by reference.
data[]
x1 group rleid mean q25 1: 88 male 1 76.36364 66.5 2: 88 male 1 76.36364 66.5 3: 94 male 1 76.36364 66.5 4: 82 male 1 76.36364 66.5 5: 68 male 1 76.36364 66.5 6: 72 male 1 76.36364 66.5 7: 43 male 1 76.36364 66.5 8: 84 male 1 76.36364 66.5 9: 65 male 1 76.36364 66.5 10: 91 male 1 76.36364 66.5 11: 65 male 1 76.36364 66.5 12: 80 female 2 NA NA 13: 82 female 2 NA NA 14: 63 female 2 NA NA 15: 67 female 2 NA NA 16: 58 female 2 NA NA 17: 100 female 2 NA NA 18: 32 female 2 NA NA 19: 75 male 3 NA NA 20: 66 male 3 NA NA 21: 30 male 3 NA NA 22: 12 male 3 NA NA 23: 97 male 3 NA NA 24: 58 male 3 NA NA 25: 14 male 3 NA NA 26: 64 male 3 NA NA x1 group rleid mean q25
请注意,data
已通过引用进行更新,即未复制。
OP 要求将第二个男性组中所有超过 q25(按第一个男性组计算)的 `x1` 值,替换为按第一个男性组计算的平均值。请注意,第二个男性组由 `rleid == 3L` 标识,因为女性组位于两个男性组之间。
这可以通过 update non-equi join 实现。连接条件仅选择属于 `rleid == 3L` 且 `x1` 大于 q25 的那些行。
# Non-equi update join. The lookup table built from agg carries rleid = 3
# (the second male run) together with V1 (mean) and V2 (25% quantile) of the
# first run. Rows of data with a matching rleid and x1 > V2 get x1
# overwritten by V1; the trailing [] prints the updated table.
data[agg[, .(rleid = 3, V1, V2)], on = .(rleid, x1 > V2), x1 := V1][]
# remove helper column no longer needed
data[, rleid := NULL]
data[]
x1 group mean q25 1: 88.00000 male 76.36364 66.5 2: 88.00000 male 76.36364 66.5 3: 94.00000 male 76.36364 66.5 4: 82.00000 male 76.36364 66.5 5: 68.00000 male 76.36364 66.5 6: 72.00000 male 76.36364 66.5 7: 43.00000 male 76.36364 66.5 8: 84.00000 male 76.36364 66.5 9: 65.00000 male 76.36364 66.5 10: 91.00000 male 76.36364 66.5 11: 65.00000 male 76.36364 66.5 12: 80.00000 female NA NA 13: 82.00000 female NA NA 14: 63.00000 female NA NA 15: 67.00000 female NA NA 16: 58.00000 female NA NA 17: 100.00000 female NA NA 18: 32.00000 female NA NA 19: 76.36364 male NA NA 20: 66.00000 male NA NA 21: 30.00000 male NA NA 22: 12.00000 male NA NA 23: 76.36364 male NA NA 24: 58.00000 male NA NA 25: 14.00000 male NA NA 26: 64.00000 male NA NA x1 group mean q25
请注意,第19和23行已根据要求进行了更新。同样,data
通过引用进行更新。