如何根据第 2 列的值,给另一列的值加 1?

时间:2019-09-29 18:27:36

标签: r dataframe

数据按 PERNO 分组。在每个组内,如果第 i 行的 col2 == "a",我想把 col3 中从第 i + 1 行到该组末尾的值加 1。例如:

      PERNO     col2      col3
        1         b         3
        1         d         3
        1         a         4
        1         d         5
        2         v         2
        2         a         3
        2         a         4
        2         x         4
        2         h         5

输出

      PERNO     col2      col3
        1         b         3
        1         d         3
        1         a         4
        1         d         6
        2         v         2
        2         a         3
        2         a         4
        2         x         5
        2         h         6

在第一组中,col3 的第四行加了 1,因为它位于 col2 == "a" 那一行之后

第二组的最后 2 行也各加了 1

真实数据:

str(df)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   153008 obs. of  3 variables:
 $ PERNO: num  1 1 1 1 1 1 1 1 2 2 ...
 $ loop : num  1 2 2 2 2 2 2 2 1 2 ...
 $ TPURP: Factor w/ 23 levels "(1) Working at home (for pay)",..: 2 3 2 13 13 13 15 2 2 13 ...

dput(df)

structure(list(PERNO = c(1, 1, 1, 1, 1, 1), loop = c(2, 1, 2, 
2, 2, 2), TPURP = structure(c(2L, 2L, 22L, 15L, 15L, 15L), .Label = c("(1) Working at home (for pay)", 
"(2) All other home activities", "(3) Work/Job", "(4) All other activities at work", 
"(5) Attending class", "(6) All other activities at school", 
"(7) Change type of transportation/transfer", "(8) Dropped off passenger", 
"(9) Picked up passenger", "(10) Other, specify - transportation", 
"(11) Work/Business related", "(12) Service Private Vehicle", 
"(13) Routine Shopping", "(14) Shopping for major purchases", 
"(15) Household errands", "(16) Personal Business", "(17) Eat meal outside of home", 
"(18) Health care", "(19) Civic/Religious activities", "(20) Recreation/Entertainment", 
"(21) Visit friends/relative", "(24) Loop trip", "(97) Other, specify"
), class = "factor")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L))

其中,真实数据与示例数据的对应关系为:

loop 对应 col3,TPURP 对应 col2,"(24) Loop trip" 对应 "a"

1 个答案:

答案 0 :(得分:2)

按 'PERNO' 分组后,对逻辑表达式 (col2 == 'a') 取 cummax,再对其取 lag(其中 n 指定为 'col2' 中 "a" 元素的数量),然后把结果加(+)到 'col3' 上

library(dplyr)
# Within each PERNO group, add 1 to col3 on every row below the last "a".
# cummax() switches the indicator on at the first "a"; lag() then delays it
# by the span from the first to the last "a" (inclusive). That span equals
# the number of "a" rows when they are adjacent, but stays correct when
# other rows sit between two "a" rows (a plain count would then also
# increment rows at or before the last "a").
# Groups without any "a" use n = 0, so the all-zero indicator is added as is.
df1 %>%
  group_by(PERNO) %>%
  mutate(
    col3 = col3 + lag(
      cummax(col2 == "a"),
      n = if (any(col2 == "a")) diff(range(which(col2 == "a"))) + 1L else 0L,
      default = 0
    )
  )
# A tibble: 9 x 3
# Groups:   PERNO [2]
#  PERNO col2   col3
#  <int> <chr> <dbl>
#1     1 b         3
#2     1 d         3
#3     1 a         4
#4     1 d         6
#5     2 v         2
#6     2 a         3
#7     2 a         4
#8     2 x         5
#9     2 h         6

另一种选择是找到每组中最后一个 "a" 出现的位置,然后使用 case_when 给该位置之后的行加 1。

# Per PERNO group: leave col3 untouched when the group has no "a";
# otherwise add 1 to every row below the last "a".
df1 %>%
  group_by(PERNO) %>%
  mutate(
    col3 = if (!("a" %in% col2)) {
      col3
    } else {
      case_when(
        row_number() <= tail(which(col2 == "a"), 1) ~ col3,
        TRUE ~ col3 + 1L
      )
    }
  )

或者使用 pmin 来避免 if/else 分支

# Same result without an explicit if/else: when a group has no "a",
# tail(...)[1] is NA and pmin(..., na.rm = TRUE) falls back to the group
# size n(), so no row number exceeds the cutoff and nothing is incremented.
df1 %>%
  group_by(PERNO) %>%
  mutate(
    col3 = case_when(
      row_number() <= pmin(
        n(),
        tail(which(col2 == "a"), 1)[1],
        na.rm = TRUE
      ) ~ col3,
      TRUE ~ col3 + 1L
    )
  )

或使用data.table

library(data.table)
# Convert df1 to a data.table by reference, then collect the global row
# indices (.I) that lie below the last "a" within each PERNO group.
setDT(df1)
i1 <- df1[, .I[.I > tail(.I[col2 == "a"], 1)], by = PERNO]$V1
# Increment col3 in place on exactly those rows.
df1[i1, col3 := col3 + 1L]

更新

使用OP的新数据集

# Apply the pmin() approach to the OP's data: factor columns are converted
# to character first, then `loop` is incremented on every row below the
# last "(24) Loop trip" within each PERNO group.
df %>%
  mutate_if(is.factor, as.character) %>%
  group_by(PERNO) %>%
  mutate(
    loop = case_when(
      row_number() <= pmin(
        n(),
        tail(which(TPURP == "(24) Loop trip"), 1)[1],
        na.rm = TRUE
      ) ~ loop,
      TRUE ~ loop + 1
    )
  )
# A tibble: 6 x 3
# Groups:   PERNO [1]
#  PERNO  loop TPURP                        
#  <dbl> <dbl> <chr>                        
#1     1     2 (2) All other home activities
#2     1     1 (2) All other home activities
#3     1     2 (24) Loop trip               
#4     1     3 (15) Household errands       
#5     1     3 (15) Household errands       
#6     1     3 (15) Household errands       

数据

# Example data from the question: 9 rows in two PERNO groups.
# stringsAsFactors = FALSE keeps col2 as character on every R version.
df1 <- data.frame(
  PERNO = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
  col2 = c("b", "d", "a", "d", "v", "a", "a", "x", "h"),
  col3 = c(3L, 3L, 4L, 5L, 2L, 3L, 4L, 4L, 5L),
  stringsAsFactors = FALSE
)


# Count, per PERNO, the rows whose TPURP contains the literal text
# "(24) Loop trip" (fixed = TRUE disables regex interpretation).
# NOTE(review): df9 is not defined in this snippet; presumably the OP's
# full data set. Confirm.
df9 %>%
  group_by(PERNO) %>%
  summarise(
    Sum = length(grep("(24) Loop trip", TPURP, fixed = TRUE))
  )

结果为

PERNO   Sum
  <dbl> <int>
1     1   483
2     2   268
3     3    60
4     4    39
5     5    16
6     6    11
7     7     0
8     8     0

如果我们用OP的完整数据检查输出

# Read the OP's data from CSV, keeping text columns as character.
fileN <- "df.csv"
df <- read.csv(file = fileN, stringsAsFactors = FALSE)

# Within each PERNO group, add 1 to `loop` on every row below the last
# "(24) Loop trip"; groups without a match are left unchanged because
# pmin(..., na.rm = TRUE) then falls back to the group size n().
out <- df %>%
  group_by(PERNO) %>%
  mutate(
    loop = case_when(
      row_number() <= pmin(
        n(),
        tail(which(TPURP == "(24) Loop trip"), 1)[1],
        na.rm = TRUE
      ) ~ loop,
      TRUE ~ loop + 1L
    )
  )

- 检查第一个“PERNO”的输出:从“TPURP”中该字符串最后一次匹配的行开始,比较修改前后的“loop”列

 # Original data, PERNO 1: first 10 `loop` values starting at (and
 # including) the last "(24) Loop trip" row.
 df %>%
   filter(PERNO == 1) %>%
   select(TPURP, loop) %>%
   filter(row_number() >= tail(which(TPURP == "(24) Loop trip"), 1)[1]) %>%
   pull(loop) %>%
   head(n = 10)
#[1] 2 2 2 2 1 2 2 1 2 2

# Updated data, PERNO 1: same slice as above. Grouping is dropped first so
# row_number() counts rows of the filtered table as a whole.
out %>%
  ungroup() %>%
  filter(PERNO == 1) %>%
  select(TPURP, loop) %>%
  filter(row_number() >= tail(which(TPURP == "(24) Loop trip"), 1)[1]) %>%
  pull(loop) %>%
  head(n = 10)
#[1] 2 3 3 3 2 3 3 2 3 3

注意,除第一个值(即最后一次匹配的那一行本身)保持不变外,其余的值都加了 1