dplyr对每个组应用超前/滞后

时间:2017-02-21 11:57:28

标签: r group-by dplyr

我正在尝试对数据框中各个分组内的某一列应用超前/滞后(lead/lag)。我有一个单独的数据框,提供超前值。超前值必须按受试者(subj)、节点(node)和传感器(sensor)进行匹配。

示例数据:

dput(test_df)
structure(list(subj = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), class = "factor", .Label = c("c1", "c2")), node = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Node 1", "Node 2"), class = "factor"), 
    sensor = c(2600, 2600, 2600, 2600, 2600, 2610, 2610, 2610, 
    2610, 2610, 2620, 2620, 2620, 2620, 2620, 2630, 2630, 2630, 
    2630, 2630, 2600, 2600, 2600, 2600, 2600, 2610, 2610, 2610, 
    2610, 2610, 2620, 2620, 2620, 2620, 2620, 2630, 2630, 2630, 
    2630, 2630), env_vals = c(5.33510151261835, 5.37708998203619, 
    5.18984848232565, 6.82992070825272, 5.92982096601743, 7.05707692156306, 
    7.67415658214675, 7.34534719192697, 6.5280531083936, 4.42063211468128, 
    4.98606873099945, 6.71683566611408, 7.04201828330796, 3.22384043747125, 
    7.16178630140025, 3.97134044753568, 3.06904118833596, 6.10839825474766, 
    2.51080443592448, 1.62815576579611, 4.5366549039861, 4.05204500710188, 
    8.50974398925943, 0.454711437225098, 7.63457277730028, 7.73074760170432, 
    1.7535421576035, 1.255666521349, 2.67319773682482, 1.61263970508914, 
    6.84515776718986, 4.319997054675, 5.64959416239443, 1.52348658940225, 
    4.05659367113441, 5.19205390068456, 2.41995034428535, 4.81929265375379, 
    4.65957617474215, 3.85295676615691)), .Names = c("subj", 
"node", "sensor", "env_vals"), row.names = c(NA, -40L), class = "data.frame")

dput(cc_df)
structure(list(subj = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("c1", "c2"), class = "factor"), 
    node = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L), .Label = c("Node 1", "Node 2"), class = "factor"), 
    sensor = c(2600, 2610, 2620, 2630, 2600, 2610, 2620, 2630, 
    2600, 2610, 2620, 2630, 2600, 2610, 2620, 2630), lg_val = c(1, 
    1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4)), .Names = c("subj", 
"node", "sensor", "lg_val"), row.names = c(NA, -16L), class = "data.frame")

我遍历cc_df中的受试者、节点、传感器组合,然后使用这些组合来应用相应的超前/滞后值。

# Walk over every row of cc_df (columns: subj, node, sensor, lg_val) and try
# to lead env_vals in test_df by that row's lg_val.
# Note: 1:nrow(cc_df) would iterate over c(1, 0) if cc_df had zero rows.
for(i in 1:nrow(cc_df)){
  # Pull the i-th combination out of cc_df by column position; the two factor
  # columns are converted to plain strings, the offset to a plain number.
  sbj <- as.character(cc_df[i,1])
  nd <- as.character(cc_df[i,2])
  sens <- cc_df[i,3]
  lg_val <- as.numeric(cc_df[i,4])
  # print(str(data.frame(sbj,nd,sens,lg_val)))

  # t2 <- filter(test_df, subj==sbj, node==nd, sensor==sens) %>% transmute_(lagged_env_sensor=lead(env_vals,lg_val))

  # Groups by the TRUE/FALSE results of the three comparisons (not by the
  # subj/node/sensor columns themselves), then writes lagged_env_sensor for
  # every row, so each iteration overwrites the previous iteration's column.
  # test_df is also reassigned to the grouped result on every pass.
  # NOTE(review): presumably the "cannot modify grouping variable" error comes
  # from the next pass calling group_by() with comparison expressions on a
  # test_df that is already grouped by them -- confirm against dplyr.
  test_df <- group_by(test_df, subj==sbj, node==nd, sensor==sens) %>% mutate(lagged_env_sensor=lead(env_vals,lg_val))
}

这会报错:Error in eval(expr, envir, enclos) : cannot modify grouping variable

如果我取消注释上面的t2部分,就能得到超前/滞后的结果(以及一些与列匹配有关的无关列)。

因此,我的问题是如何将正确的超前/滞后值应用于test_df的正确分组?

感谢。

伊恩

1 个答案:

答案 0 :(得分:0)

听起来您想执行join操作:

# Attach the lg_val offsets from cc_df to the rows of test_df, matching on
# subj, node and sensor; all rows of test_df are kept.
library(dplyr)
new_df <- test_df %>%
  left_join(y = cc_df, by = c("subj", "node", "sensor"))

现在,您在同一个表中同时拥有env_vals和lg_val,并且它们已按受试者(subj)、节点(node)和传感器(sensor)对应起来:

str(new_df)
# 'data.frame': 80 obs. of  5 variables:
#  $ subj    : Factor w/ 2 levels "c1","c2": 1 1 1 1 1 1 1 1 1 1 ...
#  $ node    : Factor w/ 2 levels "Node 1","Node 2": 1 1 1 1 1 1 1 1 1 1 ...
#  $ sensor  : num  2600 2600 2600 2600 2600 2600 2600 2600 2600 2600 ...
#  $ env_vals: num  5.34 5.34 5.38 5.38 5.19 ...
#  $ lg_val  : num  1 3 1 3 1 3 1 3 1 3 ...

您现在可以创建一个新列(尽管这会产生错误):

# NOTE(review): lg_val is passed here as a whole column, while dplyr's lead()
# documents its `n` argument as a single integer -- this is presumably the
# cause of the error mentioned in the text. A possible fix (unverified) is to
# group by subj, node, sensor and lg_val first and pass one value per group,
# e.g. lead(env_vals, first(lg_val)).
mutate(new_df, lagged_env_sensor=lead(env_vals, lg_val))