在R中满足条件时重置累计和

时间:2019-07-26 16:24:55

标签: r sum reset cumsum

所以我现在有一个看起来像这样的表:

# Sample input: Consecutive_Yrs currently holds running values (e.g. 6, 9, 10)
# instead of a per-run count of consecutive years.
data_wrong <- data.table(
  State = rep(c("NY", "PA", "NJ"), times = c(4, 3, 3)),
  Year = c(
    "1973", "1974", "1975", "2005", # NY
    "1992", "1993", "2001",         # PA
    "1930", "1931", "1932"          # NJ
  ),
  Consecutive_Yrs = c(1, 2, 3, 1, 1, 6, 1, 1, 9, 10)
)

我希望它看起来像这样:

# Desired result: Consecutive_Yrs restarts at 1 whenever the year does not
# directly follow the previous year within the same State.
data <- data.table(
  State = rep(c("NY", "PA", "NJ"), times = c(4, 3, 3)),
  Year = c(
    "1973", "1974", "1975", "2005", # NY
    "1992", "1993", "2001",         # PA
    "1930", "1931", "1932"          # NJ
  ),
  Consecutive_Yrs = c(1, 2, 3, 1, 1, 2, 1, 1, 2, 3)
)

这是我现在用来获取表格的代码:

# Placeholder column; it is overwritten by the mutate() below.
data$diff <- NA

# Within each State, compute the gap in years to the previous row. The first
# row of a group is compared with itself (default = first(Year)), giving 0.
# NOTE(review): Year is built from character strings in the sample tables, so
# this subtraction presumably needs as.numeric(Year) first -- confirm.
data <- data %>%
  group_by(State) %>%
  arrange(State) %>%
  mutate(diff = Year - lag(Year, default = first(Year)))

# Start every row at 1 so cumsum() can act as a running counter.
data$Consecutive_Yrs <- 1

# Where the gap is exactly one year take the running total, otherwise use 1.
# cumsum() here runs over the whole column, not per State or per run of
# consecutive years, so the total never restarts after a gap.
data$Consecutive_Yrs <- ifelse(data$diff == 1, cumsum(data$Consecutive_Yrs), 
1)

任何帮助将不胜感激:)

2 个答案:

答案 0 :(得分:3)

既然它是一个data.table,可以直接使用data.table的方法:

library(data.table)
# Step 1: within each State, open a new group at the first row and at every
# row whose year is more than one after the previous row's year.
data_wrong[, grp := cumsum(c(TRUE, diff(as.numeric(Year)) > 1)), by = State]
# Step 2: number the rows inside each (State, grp) run, stored as numeric.
data_wrong[, Consecutive_Yrs := as.numeric(seq_len(.N)), by = .(State, grp)]
# Show the updated table.
data_wrong
#    State Year Consecutive_Yrs grp
# 1:    NY 1973               1   1
# 2:    NY 1974               2   1
# 3:    NY 1975               3   1
# 4:    NY 2005               1   2
# 5:    PA 1992               1   1
# 6:    PA 1993               2   1
# 7:    PA 2001               1   2
# 8:    NJ 1930               1   1
# 9:    NJ 1931               2   1
#10:    NJ 1932               3   1

或使用rowid

# Count consecutive years per State, restarting at 1 after every gap.
# The run id is cumsum() of the "gap larger than one year" flag, so each gap
# opens a new run. rleid() on the flag itself is not enough: it only changes
# when the flag flips, so a row that follows a gap (flag TRUE then FALSE) or
# two gaps in a row (TRUE, TRUE) would be numbered incorrectly.
data_wrong[, Consecutive_Yrs2 := {
  yr <- as.numeric(Year)
  # The first row of a State is compared with itself, so it never flags a gap.
  new_run <- yr - shift(yr, fill = yr[1]) > 1
  rowid(cumsum(new_run))
}, by = State]

答案 1 :(得分:1)

使用dplyr的一种可能做法是:

# Count consecutive years per State, restarting at 1 after every gap.
data_wrong %>%
  group_by(State) %>%
  mutate(
    Year = as.numeric(Year),
    # Run id: increases by one at every gap larger than one year. cumsum() is
    # used instead of rle() on the flag because rle() only splits where the
    # flag flips, which misnumbers rows that follow a gap and back-to-back
    # gaps. The first row of a State is compared with itself (difference 0).
    run_id = cumsum(Year - lag(Year, default = first(Year)) > 1)
  ) %>%
  group_by(State, run_id) %>%
  mutate(Consecutive_Yrs = seq_along(run_id)) %>%
  ungroup() %>%
  select(-run_id)

   State  Year Consecutive_Yrs
   <chr> <dbl>           <int>
 1 NY     1973               1
 2 NY     1974               2
 3 NY     1975               3
 4 NY     2005               1
 5 PA     1992               1
 6 PA     1993               2
 7 PA     2001               1
 8 NJ     1930               1
 9 NJ     1931               2
10 NJ     1932               3

或者:

# Same computation using row_number(): count consecutive years per State,
# restarting at 1 after every gap.
data_wrong %>%
  group_by(State) %>%
  mutate(
    Year = as.numeric(Year),
    # Run id: increases by one at every gap larger than one year. cumsum() is
    # used instead of rle() on the flag because rle() only splits where the
    # flag flips, which misnumbers rows that follow a gap and back-to-back
    # gaps. The first row of a State is compared with itself (difference 0).
    run_id = cumsum(Year - lag(Year, default = first(Year)) > 1)
  ) %>%
  group_by(State, run_id) %>%
  mutate(Consecutive_Yrs = row_number()) %>%
  ungroup() %>%
  select(-run_id)