在这篇文章(select group before certain observations separated by grouping var in R with NA control)中,给出的解决方案在只有一组 add 并使用 na.rm=T 时有效。
但是对于下面这份包含三组的新数据:
# Example data from the question (dput() output): 52 rows, five columns
# (add, x1, add1, group, add2). All rows share one add/add1/add2 combination
# ("x", 514, 2018). `group` forms three consecutive runs: male, then two
# female rows, then male again. x1 contains no NA in this sample.
data=structure(list(add = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "x", class = "factor"),
x1 = c(0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), add1 = c(514L, 514L,
514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L,
514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L,
514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L,
514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L,
514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L
), group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("female",
"male"), class = "factor"), add2 = c(2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L)), .Names = c("add", "x1", "add1",
"group", "add2"), class = "data.frame", row.names = c(NA, -52L
))
所以当我运行代码
library(tidyverse)
library( data.table)
# Pipeline from the question (kept exactly as posted, including the failure).
# Steps:
#   1. Within each add/add1/add2 combination, number consecutive runs of
#      `group` with rleid() -> group2.
#   2. Per add/add1/add2/group/group2, compute MEAN and Q25 of x1 restricted to
#      rows where group == "male" & group2 == 1 (NA removed).
#   3. Per add/add1/add2, replace x1 by MEAN in rows where group == "male",
#      group2 == 3 and x1 exceeds Q25; other rows keep x1.
#   4. Drop the helper column group2 and return a plain data.frame.
# NOTE(review): unique(Q25[!is.na(Q25)]) has length 0 when every Q25 of an
# add/add1/add2 combination is NA; the ifelse() result then presumably has
# length 0 as well, which mutate() rejects -- likely the source of the error.
data %>%
group_by(add,add1,add2) %>%
mutate(group2 = rleid(group)) %>%
group_by(add,add1,add2, group, group2) %>%
mutate(MEAN = mean(x1[group=="male" & group2==1], na.rm = T), ## extra code here ##
Q25 = quantile(x1[group=="male" & group2==1], 0.25, na.rm = T)) %>% ## extra code here ##
group_by(add,add1,add2) %>%
mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1))%>%
ungroup() %>%
select(-group2) %>%
data.frame()
我遇到错误
Error in mutate_impl(.data, dots) :
Column `x1` must be length 24 (the group size) or one, not 0
PS:我只提供了一个示例来说明数据结构,因为实际数据有 1000 个组,我无法找出是哪个组导致了错误。
如何解决此错误?
答案 0 :(得分:2)
如果我理解正确,该错误是由这样的子组引起的:在第一个男性组(group2 == 1L)中,所有的 x1 均为 NA。
恕我直言,一种更简洁的方法是:先按照 here 的建议计算所有组的统计信息,再按照 here 的建议使用非等值连接(non-equi join)来更新第二个男性组中受影响的行。
library( data.table)
# Turn `data` into a data.table by reference and number the consecutive runs
# of `group` within each add/add1/add2 combination (1 = first run, 2 = second,
# ...). Both steps modify `data` in place.
setDT(data)
data[, group2 := rleid(group), by = .(add, add1, add2)]

# Statistics of the first male run (group2 == 1) per add/add1/add2 combination.
# The constant group2 = 3L tags each result row with the run it is meant to be
# joined against later. A combination whose first male run holds only NA yields
# mean = NaN and q25 = NA.
grp_stats <- data[
  group2 == 1L & group == "male",
  .(
    group2 = 3L,
    mean = mean(x1, na.rm = TRUE),
    q25 = quantile(x1, probs = 0.25, na.rm = TRUE)
  ),
  by = .(add, add1, add2)
]
grp_stats
   add add1 add2 group2 mean  q25
1:   x  514 2018      3  1.5 1.25
2:   y  515 2018      3  NaN   NA
3:   z  516 2018      3  2.0 2.00
这样可以清楚地识别出统计信息有误的组。是否从数据集中删除受影响的组,由 OP 自行决定。
不过,在后续的连接中我们可以保留它们,因为它们不会产生任何影响。
组统计信息中添加了一个取常数值 3 的列 group2,以简化后续的非等值连接更新(update in a non-equi join):
# mean() returns a double, so convert x1 to double first; the update below
# assigns those means into x1.
data[, x1 := as.numeric(x1)]

# Update join: for rows of `data` that match a grp_stats row on
# group2/add/add1/add2 and whose x1 is greater than q25, overwrite x1 with the
# `mean` column of grp_stats. The trailing [] prints the updated table.
data[
  grp_stats,
  on = .(group2, add, add1, add2, x1 > q25),
  x1 := mean
][]
data
    add  x1 add1  group add2 group2
 1:   x 1.0  514   male 2018      1
 2:   x 2.0  514   male 2018      1
 3:   x  NA  514 female 2018      2
 4:   x  NA  514 female 2018      2
 5:   x 1.5  514   male 2018      3
 6:   x 1.0  514   male 2018      3
 7:   y  NA  515   male 2018      1
 8:   y  NA  515   male 2018      1
 9:   y  NA  515 female 2018      2
10:   y  NA  515 female 2018      2
11:   y 7.0  515   male 2018      3
12:   y 1.0  515   male 2018      3
13:   z 2.0  516   male 2018      1
14:   z  NA  516   male 2018      1
15:   z  NA  516 female 2018      2
16:   z  NA  516 female 2018      2
17:   z 2.0  516   male 2018      3
18:   z 1.0  516   male 2018      3
请注意,第 5 行和第 17 行已更新,而统计信息有误的第二组(y)的行未被改动。
在连接之前,x1 被强制转换为 numeric 类型,以匹配 mean() 返回结果的类型。
下面是由三个组组成的样本数据。在第二个组中,第一段男性(male)记录的所有 x1 值均为 NA。
# Sample data with three add/add1/add2 combinations (x, y, z), six rows each,
# laid out as male / female / male runs. In combination y the first male run
# has only NA in x1.
data <- data.table::fread("
add x1 add1 group add2
x 1 514 male 2018
x 2 514 male 2018
x NA 514 female 2018
x NA 514 female 2018
x 7 514 male 2018
x 1 514 male 2018
y NA 515 male 2018
y NA 515 male 2018
y NA 515 female 2018
y NA 515 female 2018
y 7 515 male 2018
y 1 515 male 2018
z 2 516 male 2018
z NA 516 male 2018
z NA 516 female 2018
z NA 516 female 2018
z 7 516 male 2018
z 1 516 male 2018
")
将上述示例数据集传递到OP的代码中后,我们可以重现错误消息:
library(dplyr)
# OP's pipeline, run unchanged on the sample data to reproduce the error.
# It numbers consecutive runs of `group` (group2), computes MEAN and Q25 of x1
# over rows with group == "male" & group2 == 1, and then tries to replace x1 by
# MEAN where group == "male", group2 == 3 and x1 > Q25.
# NOTE(review): for an add/add1/add2 combination whose Q25 values are all NA,
# unique(Q25[!is.na(Q25)]) has length 0, so the ifelse() result presumably has
# length 0 too -- consistent with the "must be length 6 ... not 0" error.
data %>%
group_by(add,add1,add2) %>%
mutate(group2 = rleid(group)) %>%
group_by(add,add1,add2, group, group2) %>%
mutate(MEAN = mean(x1[group=="male" & group2==1], na.rm = T), ## extra code here ##
Q25 = quantile(x1[group=="male" & group2==1], 0.25, na.rm = T)) %>% ## extra code here ##
group_by(add,add1,add2) %>%
mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1))%>%
ungroup() %>%
select(-group2) %>%
data.frame()
Error in mutate_impl(.data, dots) :
Column `x1` must be length 6 (the group size) or one, not 0