所以我现在有一个看起来像这样的表:
# Sample input: one row per State/Year. Consecutive_Yrs holds the counts
# produced by the asker's current code (note the 6, 9 and 10).
data_wrong <- data.table(
  State = rep(c("NY", "PA", "NJ"), times = c(4, 3, 3)),
  Year = as.character(
    c(1973:1975, 2005, 1992, 1993, 2001, 1930:1932)
  ),
  Consecutive_Yrs = c(1, 2, 3, 1, 1, 6, 1, 1, 9, 10)
)
我希望它看起来像这样:
# Desired result: Consecutive_Yrs restarts at 1 whenever the Year is not
# exactly one more than the previous Year within the same State.
data <- data.table(
  State = rep(c("NY", "PA", "NJ"), times = c(4, 3, 3)),
  Year = as.character(
    c(1973:1975, 2005, 1992, 1993, 2001, 1930:1932)
  ),
  Consecutive_Yrs = c(1, 2, 3, 1, 1, 2, 1, 1, 2, 3)
)
这是我现在用来获取表格的代码:
# Asker's current attempt at counting consecutive years per State.
# Placeholder column; it is overwritten by the mutate() below.
data$diff <- NA
# Within each State, diff = Year minus the previous row's Year; the first row
# of each State gets 0 because lag() defaults to first(Year).
# NOTE(review): the sample tables above store Year as character, and the
# subtraction needs a numeric Year -- confirm the real column type.
data <- data %>%
group_by(State) %>%
arrange(State) %>%
mutate(diff = Year - lag(Year, default = first(Year)))
# Start every row at 1 ...
data$Consecutive_Yrs <- 1
# ... then, where diff == 1, take the running total. cumsum() here runs over
# the entire column (not per State or per run of consecutive years), so it
# yields the overall row position -- hence values such as 6, 9 and 10 in
# data_wrong instead of 2, 2 and 3.
data$Consecutive_Yrs <- ifelse(data$diff == 1, cumsum(data$Consecutive_Yrs),
1)
任何帮助将不胜感激:)
答案 0 :(得分:3)
这是一个 data.table,因此可以选择使用 data.table 的方法:
library(data.table)

# Step 1: within each State, open a new run id (grp) at the first row and
# whenever the gap to the previous Year is larger than 1.
data_wrong[, grp := {
  yrs <- as.numeric(Year)
  cumsum(c(TRUE, diff(yrs) > 1))
}, by = State]

# Step 2: number the rows 1..N inside every (State, grp) run; stored as
# numeric to match the type of the original Consecutive_Yrs column.
data_wrong[, Consecutive_Yrs := as.numeric(seq_len(.N)), by = .(State, grp)]

data_wrong
# State Year Consecutive_Yrs grp
# 1: NY 1973 1 1
# 2: NY 1974 2 1
# 3: NY 1975 3 1
# 4: NY 2005 1 2
# 5: PA 1992 1 1
# 6: PA 1993 2 1
# 7: PA 2001 1 2
# 8: NJ 1930 1 1
# 9: NJ 1931 2 1
#10: NJ 1932 3 1
或者使用 rowid:
# Alternative: number rows within each run of consecutive years, per State,
# writing the result to Consecutive_Yrs2.
data_wrong[, Consecutive_Yrs2 := {
  yrs <- as.numeric(Year)
  # TRUE where the gap to the previous Year exceeds 1 (the first row of a
  # State compares against itself, so it is FALSE).
  new_run <- yrs - shift(yrs, fill = yrs[1]) > 1
  # cumsum() gives every gap its own run id. rleid() on the logical flag
  # would merge back-to-back gaps (e.g. 1990, 2000, 2010 -> F, T, T) into a
  # single run and count 1, 1, 2 instead of 1, 1, 1.
  rowid(cumsum(new_run))
}, by = .(State)]
答案 1 :(得分:1)
使用 dplyr 的一种做法可以是:
# Count consecutive years per State with dplyr.
data_wrong %>%
  group_by(State) %>%
  mutate(
    Year = as.numeric(Year),
    # TRUE where the gap to the previous Year (within the State) exceeds 1;
    # the first row compares against itself and is FALSE.
    flag = Year - lag(Year, default = first(Year)) > 1,
    # Turn the flag into a run id with cumsum() so every gap opens a new run.
    # Run-length ids from rle(flag) would merge back-to-back gaps
    # (F, T, T -> ids 1, 2, 2) and count 1, 1, 2 instead of 1, 1, 1.
    flag = cumsum(flag)
  ) %>%
  group_by(State, flag) %>%
  # Position of each row inside its (State, run) group.
  mutate(Consecutive_Yrs = seq_along(flag)) %>%
  ungroup() %>%
  select(-flag)
State Year Consecutive_Yrs
<chr> <dbl> <int>
1 NY 1973 1
2 NY 1974 2
3 NY 1975 3
4 NY 2005 1
5 PA 1992 1
6 PA 1993 2
7 PA 2001 1
8 NJ 1930 1
9 NJ 1931 2
10 NJ 1932 3
或者:
# Same approach as above, using row_number() for the within-run position.
data_wrong %>%
  group_by(State) %>%
  mutate(
    Year = as.numeric(Year),
    # TRUE where the gap to the previous Year (within the State) exceeds 1;
    # the first row compares against itself and is FALSE.
    flag = Year - lag(Year, default = first(Year)) > 1,
    # cumsum() gives every gap its own run id; rle(flag) run ids would merge
    # back-to-back gaps (F, T, T -> ids 1, 2, 2) and miscount them.
    flag = cumsum(flag)
  ) %>%
  group_by(State, flag) %>%
  mutate(Consecutive_Yrs = row_number()) %>%
  ungroup() %>%
  select(-flag)