数据按PERNO分组。在每个组内,如果第i行的col2 == "a",我想把从第i + 1行到该组末尾的各行的col3都加1,例如:
PERNO col2 col3
1 b 3
1 d 3
1 a 4
1 d 5
2 v 2
2 a 3
2 a 4
2 x 4
2 h 5
输出
PERNO col2 col3
1 b 3
1 d 3
1 a 4
1 d 6
2 v 2
2 a 3
2 a 4
2 x 5
2 h 6
在第一组中,第四行的col3加了1,因为它位于col2 == "a"那一行之后
第二组的最后2行的col3也各加了1
真实数据:
str(df)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 153008 obs. of 3 variables:
$ PERNO: num 1 1 1 1 1 1 1 1 2 2 ...
$ loop : num 1 2 2 2 2 2 2 2 1 2 ...
$ TPURP: Factor w/ 23 levels "(1) Working at home (for pay)",..: 2 3 2 13 13 13 15 2 2 13 ...
dput(df)
structure(list(PERNO = c(1, 1, 1, 1, 1, 1), loop = c(2, 1, 2,
2, 2, 2), TPURP = structure(c(2L, 2L, 22L, 15L, 15L, 15L), .Label = c("(1) Working at home (for pay)",
"(2) All other home activities", "(3) Work/Job", "(4) All other activities at work",
"(5) Attending class", "(6) All other activities at school",
"(7) Change type of transportation/transfer", "(8) Dropped off passenger",
"(9) Picked up passenger", "(10) Other, specify - transportation",
"(11) Work/Business related", "(12) Service Private Vehicle",
"(13) Routine Shopping", "(14) Shopping for major purchases",
"(15) Household errands", "(16) Personal Business", "(17) Eat meal outside of home",
"(18) Health care", "(19) Civic/Religious activities", "(20) Recreation/Entertainment",
"(21) Visit friends/relative", "(24) Loop trip", "(97) Other, specify"
), class = "factor")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L))
其中
loop 对应 col3,TPURP 对应 col2,"a" 对应 "(24) Loop trip"
答案 0 :(得分:2)
按 'PERNO' 分组后,对逻辑表达式 col2 == 'a' 取累计最大值(cummax),再对结果取 lag,其中 n 指定为 'col2' 中 "a" 元素的数量,最后把它加(+)到 'col3' 上
library(dplyr)
# Add 1 to `col3` on every row that comes after the last "a" of its PERNO
# group. A row qualifies when an "a" has already been seen at or before it
# (forward cummax) and no "a" is left at or after it (reversed cummax).
# Unlike shifting cummax(col2 == "a") with lag() by the number of "a" rows,
# this also gives the right answer when the "a" rows of a group are not
# adjacent (e.g. a, x, a, y must only bump the final row). `%in%` treats a
# missing col2 as "not a", and groups without any "a" are left unchanged.
df1 %>%
  group_by(PERNO) %>%
  mutate(
    col3 = col3 +
      cummax(col2 %in% "a") * (1L - rev(cummax(rev(col2 %in% "a"))))
  )
# A tibble: 9 x 3
# Groups: PERNO [2]
# PERNO col2 col3
# <int> <chr> <dbl>
#1 1 b 3
#2 1 d 3
#3 1 a 4
#4 1 d 6
#5 2 v 2
#6 2 a 3
#7 2 a 4
#8 2 x 5
#9 2 h 6
另一种选择是找到最后一个出现的“ a”的位置,使用case_when
在该位置之后加1。
# Per PERNO group: when the group contains an "a", add 1 to `col3` on the
# rows located after the last "a"; a group without "a" keeps `col3` as is.
df1 %>%
  group_by(PERNO) %>%
  mutate(
    col3 = if ("a" %in% col2) {
      case_when(
        # the guard above guarantees at least one match, so max() is safe
        row_number() > max(which(col2 == "a")) ~ col3 + 1L,
        TRUE ~ col3
      )
    } else {
      col3
    }
  )
或者使用pmin来避免if/else的情况
# Same rule without if/else: the position of the last "a" is NA when the
# group has none, and pmin(..., na.rm = TRUE) then falls back to n(), the
# group size, so no row number can exceed it and the group stays unchanged.
df1 %>%
  group_by(PERNO) %>%
  mutate(
    col3 = case_when(
      row_number() > pmin(
        tail(which(col2 == "a"), 1)[1],
        n(),
        na.rm = TRUE
      ) ~ col3 + 1L,
      TRUE ~ col3
    )
  )
或使用data.table
library(data.table)
# Turn df1 into a data.table by reference, collect the row indices (.I)
# that lie after the last "a" of each PERNO, then add 1 to `col3` on exactly
# those rows in place.
setDT(df1)
i1 <- df1[, .I[.I > tail(.I[col2 == "a"], 1)], by = PERNO]$V1
df1[i1, col3 := col3 + 1L]
使用OP的新数据集
# Same rule on the OP's columns: factors are converted to character first,
# then within each PERNO every row after the last "(24) Loop trip" gets
# `loop` increased by 1. A PERNO without that purpose is left unchanged
# because pmin() falls back to n() when the match position is NA.
df %>%
  mutate_if(is.factor, as.character) %>%
  group_by(PERNO) %>%
  mutate(
    loop = case_when(
      row_number() > pmin(
        tail(which(TPURP == "(24) Loop trip"), 1)[1],
        n(),
        na.rm = TRUE
      ) ~ loop + 1,
      TRUE ~ loop
    )
  )
# A tibble: 6 x 3
# Groups: PERNO [1]
# PERNO loop TPURP
# <dbl> <dbl> <chr>
#1 1 2 (2) All other home activities
#2 1 1 (2) All other home activities
#3 1 2 (24) Loop trip
#4 1 3 (15) Household errands
#5 1 3 (15) Household errands
#6 1 3 (15) Household errands
# Example input used by the snippets: 9 rows in two PERNO groups, with
# `col2` kept as character and `col3` as integer.
df1 <- data.frame(
  PERNO = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
  col2 = c("b", "d", "a", "d", "v", "a", "a", "x", "h"),
  col3 = c(3L, 3L, 4L, 5L, 2L, 3L, 4L, 4L, 5L),
  stringsAsFactors = FALSE
)
# Count, for each PERNO, the rows whose TPURP contains the literal text
# "(24) Loop trip"; fixed = TRUE keeps the parentheses from being read as
# regular-expression syntax.
df9 %>%
  group_by(PERNO) %>%
  summarise(
    Sum = sum(grepl(pattern = "(24) Loop trip", x = TPURP, fixed = TRUE))
  )
结果为
PERNO Sum
<dbl> <int>
1 1 483
2 2 268
3 3 60
4 4 39
5 5 16
6 6 11
7 7 0
8 8 0
如果我们用OP的完整数据检查输出
# Read the complete data set, keeping text columns as character.
fileN <- "df.csv"
df <- read.csv(file = fileN, stringsAsFactors = FALSE)

# Within each PERNO, add 1 to `loop` on every row after the last
# "(24) Loop trip"; the result stays grouped by PERNO.
out <- df %>%
  group_by(PERNO) %>%
  mutate(
    loop = case_when(
      row_number() > pmin(
        tail(which(TPURP == "(24) Loop trip"), 1)[1],
        n(),
        na.rm = TRUE
      ) ~ loop + 1L,
      TRUE ~ loop
    )
  )
检查第一个 'PERNO' 中,从该字符串在 'TPURP' 列最后一次匹配的那一行开始的 'loop' 输出
# Unmodified data: `loop` values of PERNO 1 starting at its last
# "(24) Loop trip" row (first 10 values only).
df %>%
  filter(PERNO == 1) %>%
  select(TPURP, loop) %>%
  filter(
    row_number() >= tail(which(TPURP == "(24) Loop trip"), 1)[1]
  ) %>%
  pull(loop) %>%
  head(n = 10)
#[1] 2 2 2 2 1 2 2 1 2 2
# Updated data: the same slice of PERNO 1 taken from `out`, ungrouped first
# so that row_number() counts rows of the filtered table.
out %>%
  ungroup() %>%
  filter(PERNO == 1) %>%
  select(TPURP, loop) %>%
  filter(
    row_number() >= tail(which(TPURP == "(24) Loop trip"), 1)[1]
  ) %>%
  pull(loop) %>%
  head(n = 10)
#[1] 2 3 3 3 2 3 3 2 3 3
注意,除第一个值(匹配行本身)外,其余的值都加了1