在R中根据多个条件对特定列求和来创建新列

时间:2019-12-10 11:26:28

标签: r dplyr

我正在尝试从下面的数据框中创建一个名为period的新列

structure(list(fw01 = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8, 
19.7, 15, 21.4), fw02 = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4), 
   fw03 = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 
   167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7, 
   71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 
   301, 121), fw04 = c(110, 110, 93, 110, 175, 105, 245, 62, 
   95, 123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 
   150, 150, 245, 175, 66, 91, 113, 264, 175, 335, 109), fw05 = c(3.9, 
   3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 
   3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 
   3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
   ), fw06 = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 
   3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 
   1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14, 
   1.513, 3.17, 2.77, 3.57, 2.78), fw07 = c(16.46, 17.02, 18.61, 
   19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6, 
   18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87, 
   17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
   ), fw08 = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 
   0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), fw09 = c(1, 
   1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
   0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), fw10 = c(4, 4, 4, 3, 
   3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 
   3, 3, 4, 5, 5, 5, 5, 5, 4), fw11 = c(4, 4, 1, 1, 2, 1, 4, 
   2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1, 
   2, 2, 4, 6, 8, 2)), class = "data.frame", row.names = c(NA, 
-32L))

这些是我根据要求定义的期间

# Week-column names belonging to each period, p1 .. p12.
# Names are zero-padded ("fw01" ... "fw52"); p3, p6, p9 and p12 hold five
# weeks each, every other period holds four.
p1 <- sprintf("fw%02d", 1:4)
p2 <- sprintf("fw%02d", 5:8)
p3 <- sprintf("fw%02d", 9:13)
p4 <- sprintf("fw%02d", 14:17)
p5 <- sprintf("fw%02d", 18:21)
p6 <- sprintf("fw%02d", 22:26)
p7 <- sprintf("fw%02d", 27:30)
p8 <- sprintf("fw%02d", 31:34)
p9 <- sprintf("fw%02d", 35:39)
p10 <- sprintf("fw%02d", 40:43)
p11 <- sprintf("fw%02d", 44:47)
p12 <- sprintf("fw%02d", 48:52)

我的要求是:如果表中最后一列对应的周属于第二期间,则period列应为第一期间各周之和,即fw01+fw02+fw03+fw04,依此类推。在上面的示例中,最后一列是第11周(fw11),按照上面定义的向量,它属于第3个期间p3。在这种情况下,我希望period列是第2个期间(p2)各周的总和,即fw05+fw06+fw07+fw08。为此,我尝试了两种不同的写法。

第一种方法是使用dplyr::case_when

# Attempt 1: choose the previous period's sum with dplyr::case_when().
# rev(names(df2))[1] is the name of the last column of df2; each branch tests
# which period vector (p2 .. p12) contains that name and maps it to the sum of
# the week columns of the period before.
# As described in the question, this stops with "fw12 not found" on the sample
# data (columns fw01 .. fw11 only): the sums on the right-hand side of the later
# branches are still evaluated even though the p3 branch already matches.
dplyr::mutate(df2,
                     prev_per = case_when(rev(names(df2))[1] %in% p2 ~ fw01+fw02+fw03+fw04,
                                        rev(names(df2))[1] %in% p3 ~ fw05+fw06+fw07+fw08,
                                        rev(names(df2))[1] %in% p4 ~ fw09+fw10+fw11+fw12+fw13,
                                        rev(names(df2))[1] %in% p5 ~ fw14+fw15+fw16+fw17,
                                        rev(names(df2))[1] %in% p6 ~ fw18+fw19+fw20+fw21,
                                        rev(names(df2))[1] %in% p7 ~ fw22+fw23+fw24+fw25+fw26,
                                        rev(names(df2))[1] %in% p8 ~ fw27+fw28+fw29+fw30,
                                        rev(names(df2))[1] %in% p9 ~ fw31+fw32+fw33+fw34,
                                        rev(names(df2))[1] %in% p10 ~ fw35+fw36+fw37+fw38+fw39,
                                        rev(names(df2))[1] %in% p11 ~ fw40+fw41+fw42+fw43,
                                        rev(names(df2))[1] %in% p12 ~ fw44+fw45+fw46+fw47))

上面代码的问题在于,数据框中并不存在其余的列(fw12及之后)。理想情况下,case_when的第二个条件就已经满足,但求值并不会在此停止,而是继续计算后面的分支,从而导致报错:fw12 not found

第二种方法使用的是base包中的ifelse函数。

# Attempt 2: the same selection written as nested base-R ifelse() calls.
# The test `rev(names(df2))[1] %in% pN` is a single TRUE/FALSE, not one value
# per row; the question reports that prev_per then holds the same number
# (22.98, i.e. fw05+fw06+fw07+fw08 of the first row) in every row.
# NOTE(review): ifelse() returns a result shaped like its test, so presumably
# only the first row's sum is kept and then recycled - confirm against ?ifelse.
# NOTE(review): the innermost ifelse() is called without a `no` argument.
df7<- dplyr::mutate(df2,
                    prev_per = ifelse(rev(names(df2))[1] %in% p2, fw01+fw02+fw03+fw04,
                               ifelse(rev(names(df2))[1] %in% p3, fw05+fw06+fw07+fw08,
                               ifelse(rev(names(df2))[1] %in% p4, fw09+fw10+fw11+fw12+fw13))))

在这种情况下,求值会在条件满足时正常停止,但period列(即prev_per)的所有行返回的都是同一个数字,如下所示:

structure(list(fw01 = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8, 
19.7, 15, 21.4), fw02 = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4), 
    fw03 = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 
    167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7, 
    71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 
    301, 121), fw04 = c(110, 110, 93, 110, 175, 105, 245, 62, 
    95, 123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 
    150, 150, 245, 175, 66, 91, 113, 264, 175, 335, 109), fw05 = c(3.9, 
    3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 
    3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 
    3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
    ), fw06 = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 
    3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 
    1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14, 
    1.513, 3.17, 2.77, 3.57, 2.78), fw07 = c(16.46, 17.02, 18.61, 
    19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6, 
    18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87, 
    17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
    ), fw08 = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 
    0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), fw09 = c(1, 
    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
    0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), fw10 = c(4, 4, 4, 3, 
    3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 
    3, 3, 4, 5, 5, 5, 5, 5, 4), fw11 = c(4, 4, 1, 1, 2, 1, 4, 
    2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1, 
    2, 2, 4, 6, 8, 2), prev_per = c(22.98, 22.98, 22.98, 22.98, 
    22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 
    22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 
    22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 22.98, 
    22.98)), class = "data.frame", row.names = c(NA, -32L))

请提出如何解决此问题的建议。

2 个答案:

答案 0 :(得分:1)

作为一个实在不怎么样的解决方案,您可以利用这样一个事实:case_when会忽略NULL输入,而没有else的if在条件不成立时会返回NULL。

因此,把条件移到if中,并在其后跟上TRUE来强制求值。

TRUE

我讨厌这个答案,但它似乎有效。输出:

# Workaround: wrap every case_when() branch in an `if` that has no `else`.
# Each `if` is evaluated before case_when() sees it: when its test is FALSE it
# yields NULL, which case_when() ignores (as the answer text notes), so sums that
# refer to columns missing from df (fw12, fw13, ...) are never evaluated.
# The one branch whose period contains the last column name of df is passed on
# as `TRUE ~ <sum of the previous period's weeks>`.
# NOTE(review): only periods p2 .. p6 are handled here; branches for p7 .. p12
# would need to be added in the same form if the data can extend that far.
df %>%
  mutate(
    prev_per = case_when(
      if (rev(names(df))[1] %in% p2) TRUE ~ fw01+fw02+fw03+fw04,
      if (rev(names(df))[1] %in% p3) TRUE ~ fw05+fw06+fw07+fw08,
      if (rev(names(df))[1] %in% p4) TRUE ~ fw09+fw10+fw11+fw12+fw13,
      if (rev(names(df))[1] %in% p5) TRUE ~ fw14+fw15+fw16+fw17,
      if (rev(names(df))[1] %in% p6) TRUE ~ fw18+fw19+fw20+fw21
    )
  )

答案 1 :(得分:0)

另一种(希望更通用的)方法是对数据进行重塑(reshape),然后在按期间(period)标识的列内求和。这里的做法是:先创建一个键表,将其与重塑后的数据合并,再对拥有最大“分割点”(cut point)的期间求和。

# Build the week -> period lookup table (the 'period' key)
  pkey <-
    tibble(
      # period label repeated once per week it contains (4, 4, 5 and 4 weeks)
      perID = rep(c('p1','p2','p3','p4'), times = c(4, 4, 5, 4)),
      # zero-padded week column names fw01 .. fw17, in period order
      varlist = sprintf("fw%02d", 1:17)
    ) %>%
    group_by(perID) %>%
    # end-of-period indicator: 1 for the last week name of each period, else 0
    mutate(
      cutpt = as.numeric(varlist == max(varlist))
    )

# Period sums within rows: for every row of df, add a `period` column holding
# the sum of the week columns of the latest period whose final week is present
# in the data. Expects `pkey` (columns perID, varlist, cutpt) to exist already.
  df %>%
    # unique ids: keep the row name so each row can be rebuilt after stacking
    rownames_to_column('rID') %>%
    # stack the data: one row per (rID, week column); values go to `value`
    pivot_longer(
      -rID,
      names_to = 'varlist'
    ) %>%
    # merge the period key; the join column is named explicitly so the join
    # does not depend on (or report) whichever column names happen to match
    left_join(pkey, by = 'varlist') %>%
    group_by(rID) %>%
    # period as sum in period w/last cut point
    # NOTE(review): max(perID) compares the labels as text; with labels past
    # 'p9' (e.g. 'p10') text order may not follow period order - confirm
    # before extending pkey beyond nine periods.
    mutate(
      period = sum(value[perID == max(perID[cutpt == 1])])
    ) %>%
    # clean and widen: drop the helper columns, then restore one column per week
    select(rID,varlist,value,period) %>%
    pivot_wider(
      id_cols = c(rID,period),
      names_from = varlist,
      values_from = value
    ) 

比以前的解决方案更长,但可能更广泛地适用。