携带/使用前一组的值

时间:2018-04-10 13:27:50

标签: r data.table tidyverse

数据:

# Example data (dput-style literal): a data.table with 13 rows and the
# columns id, ax and time.
structure(
  list(
    id = c(1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 4, 4, 5),
    ax = c("a", "a", "b", "b", "b", "b", "b", "b", "c", "c", "d", "d", "e"),
    time = c(1, 3, 0, 2, 4, 5, 6, 8, 7, 9, 10, 11, 12)
  ),
  names = c("id", "ax", "time"),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -13L)
)

看起来像:

    id ax time
 1:  1  a    1
 2:  1  a    3
 3:  2  b    0
 4:  2  b    2
 5:  2  b    4
 6:  2  b    5
 7:  2  b    6
 8:  2  b    8
 9:  3  c    7
10:  3  c    9
11:  4  d   10
12:  4  d   11
13:  5  e   12

我希望把前一组的最大值(max)放在当前组的每一行旁边:

期望的输出:

    id ax time newCol
 1:  1  a    1     NA
 2:  1  a    3     NA
 3:  2  b    0      3
 4:  2  b    2      3
 5:  2  b    4      3
 6:  2  b    5      3
 7:  2  b    6      3
 8:  2  b    8      3
 9:  3  c    7      8
10:  3  c    9      8
11:  4  d   10      9
12:  4  d   11      9
13:  5  e   12     11

是否也可以取得“前前一组”(previous-previous group)的值?

重点是 base R、data.table、tidyverse 的解决方案

注意:

可以按 id 或 ax 中的任意一个进行分组。这个例子在这一点上有些冗余。

5 个答案:

答案 0 :(得分:4)

data.table解决方案:

# One row per `ax` group holding that group's maximum `time`.
dtt.max <- dtt[, list(max = max(time)), by = "ax"]
# Shift the maxima down by one row: each group gets the previous group's
# maximum, and the first group gets NA.
dtt.max[, max.prev := shift(max, n = 1L, type = "lag")]

# Update join on `ax`: write the previous group's maximum into `dtt` by
# reference as `newCol`.
dtt[dtt.max, newCol := i.max.prev, on = "ax"]
# > dtt
#     id ax time newCol
#  1:  1  a    1     NA
#  2:  1  a    3     NA
#  3:  2  b    0      3
#  4:  2  b    2      3
#  5:  2  b    4      3
#  6:  2  b    5      3
#  7:  2  b    6      3
#  8:  2  b    8      3
#  9:  3  c    7      8
# 10:  3  c    9      8
# 11:  4  d   10      9
# 12:  4  d   11      9
# 13:  5  e   12     11

答案 1 :(得分:3)

使用data.table

id + 1解决方案

library(data.table)
# Convert `d` to a data.table in place before merging, so that merge() is
# called on a data.table rather than converting `d` in the middle of the call.
setDT(d)
# Maximum `time` per id, stored under the *next* id (id + 1) so that it lines
# up with the group it should be attached to. This relies on ids being
# consecutive integers. The columns are named explicitly instead of relying on
# auto-generated names such as `V1`.
prev_max <- d[, .(newCol = max(time)), by = .(id = id + 1)]
# Left join: ids without a predecessor (the first group) get NA in `newCol`.
merge(d, prev_max, by = "id", all.x = TRUE)

答案 2 :(得分:2)

这是一种dplyr方法。这里的关键是在必要时进行分组和取消分组:

# For every row, attach the maximum `time` of the preceding `ax` group
# (NA for the first group). Groups are taken in their row order in `df`,
# and each group's rows are expected to be adjacent.
df %>%
  group_by(ax) %>%
  # Use the true group maximum rather than the last row's value, so the
  # result does not depend on `time` being sorted within each group.
  mutate(new = max(time)) %>%
  ungroup() %>%
  # Ungrouped lag: the first row of each group now holds the previous
  # group's maximum.
  mutate(new = lag(new)) %>%
  group_by(ax) %>%
  # Spread the first row's value over the whole group.
  mutate(new = new[1])

# A tibble: 13 x 4
# Groups:   ax [5]
      id ax     time   new
   <dbl> <chr> <dbl> <dbl>
 1    1. a        1.   NA 
 2    1. a        3.   NA 
 3    2. b        0.    3.
 4    2. b        2.    3.
 5    2. b        4.    3.
 6    2. b        5.    3.
 7    2. b        6.    3.
 8    2. b        8.    3.
 9    3. c        7.    8.
10    3. c        9.    8.
11    4. d       10.    9.
12    4. d       11.    9.
13    5. e       12.   11.

答案 3 :(得分:1)

假设 id 与分组(group)是一致的:

# Per-id maximum of `time`, repeated on every row of the id. The grouping is
# dropped again afterwards so that `dfr` is not left as a grouped tibble.
dfr <- dfr %>%
  group_by(id) %>%
  mutate(groupmax = max(time)) %>%
  ungroup()
# Look up the maximum of the id that is one smaller; match() returns NA when
# id - 1 does not exist, so the first group gets NA. This relies on ids being
# consecutive integers.
dfr$old_group_max <- dfr$groupmax[match(dfr$id - 1, dfr$id)]

“前前一组”的情况留作练习 :-)

答案 4 :(得分:1)

1)这不使用任何包。它先计算每个组的最大值,得到 Ag;然后对其做滞后处理,得到 LagMax;最后用 merge 将结果连接回原始数据框 DF:

# Maximum `time` for each id (one row per id).
Ag <- aggregate(time ~ id, data = DF, FUN = max)
# Pair every id with the maximum found one row above it in `Ag`; the first
# id has no predecessor and gets NA.
LagMax <- Ag["id"]
LagMax$lagmax <- c(NA, head(Ag$time, -1))
# Keep every row of DF and attach the lagged maximum by id.
merge(DF, LagMax, by = "id", all.x = TRUE)

结果为:

   id ax time lagmax
1   1  a    1     NA
2   1  a    3     NA
3   2  b    0      3
4   2  b    2      3
5   2  b    4      3
6   2  b    5      3
7   2  b    6      3
8   2  b    8      3
9   3  c    7      8
10  3  c    9      8
11  4  d   10      9
12  4  d   11      9
13  5  e   12     11

2)这会在id中对时间进行排序,以便我们知道最大值是每个id组中的最后一个值。

# Row order that sorts by id (ids kept in order of first appearance through
# the factor levels) and then by ascending time within each id.
o <- order(factor(DF$id, levels = unique(DF$id)), DF$time)
# time values rearranged into that order, so within an id the last value is
# the largest.
Time <- DF$time[o]
# r holds the row numbers of one id group. Returns NA when the group starts
# at row 1, otherwise the element of Time just before the group's first row.
lagmax <- function(r) if (r[1] == 1) NA else Time[r[1] - 1]
# ave() applies lagmax to each id's row numbers and fills the group's rows
# with the result.
# NOTE(review): r indexes rows of DF while Time is in sorted order; these
# appear to line up only if each id's rows are contiguous in DF - confirm.
transform(DF, lagmax = ave(seq_along(id), id, FUN = lagmax))

在问题中,time值已在id内排序,如果已知情况如此,则上述内容可缩短为:

# r holds the row numbers of one id group. The value just before the group's
# first row is the previous group's last time; a group starting at row 1 has
# no predecessor and yields NA.
lagmax <- function(r) {
  first_row <- r[1]
  if (first_row == 1) NA else DF$time[first_row - 1]
}
# Apply lagmax to each id's row numbers and fill the group's rows with it.
transform(DF, lagmax = ave(seq_along(id), id, FUN = lagmax))

3)下面这一行代码是(2)的 data.table 版本:

library(data.table)
DT <- copy(DF) # don't overwrite DF

# g numbers consecutive runs of equal id (rleid). Then, per (g, id) group,
# lagmax is assigned by reference from the time value in the row just before
# the group's first row (.I[1] is that first row's number in DT).
# NOTE(review): nothing here sorts time within id, so this appears to rely on
# time already being ascending inside each run - confirm.
# NOTE(review): for the group starting at row 1 the index is 0, which yields a
# zero-length value; presumably data.table leaves lagmax as NA there - verify.
setDT(DT)[, g:=rleid(id)][, lagmax := DT$time[.I[1]-1], keyby = c("g", "id")]

问题中的示例数据里,time 已在每个 id 内排好序;如果已知是这种情况,可以用下面更短的代码代替上面的最后一行:

# Per id, assign lagmax by reference from the time value in the row just
# before the id's first row (.I[1] is that first row's number in DT).
# NOTE(review): for the id starting at row 1 the index is 0, which yields a
# zero-length value; presumably data.table leaves lagmax as NA there - verify.
setDT(DT)[, lagmax := DT$time[.I[1]-1], by = id]