对于数据中的每一行,我想计算截至该行为止每个 group 的最新 value 的总和:
# Sample input. `desired` is the expected result: at each row, the sum of the
# most recent `value` seen so far for every group.
dt <- data.table(
  group   = c("a", "b", "a", "a", "b", "a"),
  value   = c(10, 5, 20, 15, 15, 10),
  desired = c(10, 15, 25, 20, 30, 25)
)
# group value desired
#1: a 10 10
#2: b 5 15
#3: a 20 25 # latest value of a is 20, of b is 5
#4: a 15 20 # latest value of a is 15, of b is 5
#5: b 15 30
#6: a 10 25
desired 列是我想要得到的结果。我可以用一个朴素的循环来实现,但我的数据量非常大,行数和组数都很多(超过 100 万行、1000 多个组)。
# Naive reference solution, shown only to illustrate *a* way of getting the
# result: for every row, re-scan all rows up to it. `set` could speed up the
# assignment, but the approach stays too slow for large data.
for (row_idx in seq_len(nrow(dt))) {
  # Last value of each group among the first `row_idx` rows.
  latest <- dt[seq_len(row_idx), value[.N], by = group]
  dt[row_idx, desired1 := sum(latest$V1)]
}
答案 0 :(得分:18)
来自 @eddi(见评论)的逻辑更加简单,省去了下文所示的迂回做法:
# Within each group, how much the value changed versus that group's previous
# value (0 before the group's first row).
dt[, incr := diff(c(0, value)), by = group]
# The running total of those changes is the sum of every group's latest value.
dt[, ans := cumsum(incr)]
不确定它是如何扩展到更多组的,但是这里有3组的示例数据:
# I hope I got the desired output correctly
# library() stops with an error when data.table is missing, whereas require()
# would only return FALSE and let the script fail later.
library(data.table)
# Sample data with three groups; `desired` is the expected running sum of
# each group's most recent `value`.
dt <- data.table(
  group   = c("a", "b", "c", "a", "a", "b", "c", "a"),
  value   = c(10, 5, 20, 25, 15, 15, 30, 10),
  desired = c(10, 15, 35, 50, 40, 50, 60, 55)
)
添加 rleid 列:
# Give every run of consecutive rows that share the same group its own run id.
dt[, id := rleid(group)]
提取每个 (group, id) 组合的最后一行:
# One row per (group, id) pair, keeping the final value seen within that pair.
last <- dt[, list(value = value[.N]), by = list(group, id)]
last 中的每个 id 都是唯一的。现在的思路是先求出每个 id 的增量,然后通过联接+更新写回 dt。
# Step 1: per group, change of the run's final value versus the previous run
# of the same group (0 when there is no previous run).
last[, incr := value - shift(value, type = "lag", fill = 0L), by = group]
# Step 2: the running total over all runs, minus the run's own value, leaves
# the contribution of every other group for the duration of that run.
last[, incr := cumsum(incr) - value]
现在执行联接+更新:
# Update join on the run id: each row's own value plus the other groups'
# contribution stored in `last`.
dt[last, ans := value + i.incr, on = "id"]
# The run id was only a helper column.
dt[, id := NULL]
# Trailing [] makes the updated table print.
dt[]
# group value desired ans
# 1: a 10 10 10
# 2: b 5 15 15
# 3: c 20 35 35
# 4: a 25 50 50
# 5: a 15 40 40
# 6: b 15 50 50
# 7: c 30 60 60
# 8: a 10 55 55
我还不确定这种做法在哪些情况下会失效……正在仔细检查。先把它写出来,好让更多人帮忙审视。
与 David 的解决方案进行比较(500 个组、10,000 行):
# library() fails loudly if data.table is not installed; require() would only
# return FALSE.
library(data.table)
set.seed(45L)
# 500 random 10-letter group labels: the sampled letters fill a 500 x 10
# matrix and each row is pasted into one string.
groups <- apply(
  matrix(sample(letters, 500L * 10L, TRUE), ncol = 10L),
  1L, paste, collapse = ""
)
uniqueN(groups) # 500L
N <- 1e4L
# Benchmark table: N rows with a random group label and a random integer
# value in 1..100.
dt <- data.table(
  group = sample(groups, N, TRUE),
  value = sample(100L, N, TRUE)
)
# For every row, return the sum of the most recent value of each group seen
# up to and including that row.
#
# dt: data.table with columns `group` and `value`.
# Returns the `ans` column as a vector. Note that `dt` is modified by
# reference: an `ans` column is added (a temporary `id` column is removed
# again before returning).
arun <- function(dt) {
  # Run id for consecutive rows of the same group.
  dt[, id := rleid(group)]
  # Final value of each run; one row per (group, id).
  run_last <- dt[, list(value = value[.N]), by = list(group, id)]
  # Change versus the previous run of the same group (0 for the first run).
  run_last[, incr := value - shift(value, type = "lag", fill = 0L), by = group]
  # Running total minus the run's own value = contribution of other groups.
  run_last[, incr := cumsum(incr) - value]
  # Row value plus the other groups' contribution, matched on run id.
  dt[run_last, ans := value + i.incr, on = "id"]
  dt[, id := NULL]
  dt[["ans"]]
}
# For every row, return the sum of the most recent value of each group, by
# reshaping to one column per group and carrying values forward.
#
# dt: data.table with columns `group` and `value`.
# Returns a numeric vector with one element per row of `dt`. Note that `dt`
# is modified by reference: an `indx` column holding the row number is added.
david <- function(dt) {
  dt[, indx := .I]
  # Name the value column explicitly rather than relying on dcast() guessing.
  res <- dcast(dt, indx ~ group, value.var = "value")
  for (j in names(res)[-1L]) {
    # Rolling join of the non-NA rows back onto all rows fills each group
    # column with its last observed value.
    set(res, j = j, value = res[!is.na(res[[j]])][res, on = "indx", roll = TRUE][[j]])
  }
  # drop = FALSE keeps a matrix even when only one group column remains;
  # rowSums() needs at least two dimensions.
  rowSums(as.matrix(res)[, -1, drop = FALSE], na.rm = TRUE)
}
# Time both implementations on the same table; the trailing comments are the
# timings recorded by the author.
system.time(ans1 <- arun(dt)) ## 0.024s
system.time(ans2 <- david(dt)) ## 38.97s
# Check that both approaches agree; ans2 is cast to integer before comparing.
identical(ans1, as.integer(ans2))
# [1] TRUE
答案 1 :(得分:2)
我会为每个组创建一个列,显示该组的最新值。然后将这些列加起来:
library(zoo)
# For each group, build a vector holding that group's latest value at every
# row (0 before the group first appears) and add it to the running result.
result <- rep(0, nrow(dt))
for (g in dt[, unique(group)]) {
  # Values of group g, NA elsewhere, with the last observation carried
  # forward. FALSE is spelled out because `F` can be reassigned.
  latest <- dt[, na.locf(ifelse(group == g, 1, NA) * value, na.rm = FALSE)]
  # Leading NAs (rows before g's first appearance) contribute 0.
  result <- result + na.fill(latest, 0)
}
# Compare against the expected column.
all(dt[, desired] == result)
答案 2 :(得分:-2)
使用 dplyr,适用于多个组,但数据必须是 data.frame,不能是 data.table。
library(dplyr)
library(tidyr)
library(zoo)
# Reshape to one column per group, carry each group's last value forward,
# then sum the group columns row by row.
# NOTE(review): spread(), mutate_each() and funs() are older tidyr/dplyr
# verbs; confirm they are still available in the installed versions.
dt %>%
  # Remember the original row order before reshaping.
  mutate(row_number = row_number()) %>%
  spread(group, value) %>%
  arrange(row_number) %>%
  # Last observation carried forward in every column; leading NAs are kept.
  mutate_each(funs(na.locf(., na.rm = FALSE))) %>%
  # Skip the first two columns and sum the rest, ignoring NAs. TRUE is
  # spelled out because `T` can be reassigned.
  mutate(answer = rowSums(.[, -1:-2], na.rm = TRUE))
在示例数据上使用上述代码(注意这里用的是 data.frame() 而不是 data.table()):
# Two-group sample as a plain data.frame; `desired` is the expected answer.
dt <- data.frame(
  group   = c("a", "b", "a", "a", "b", "a"),
  value   = c(10, 5, 20, 15, 15, 10),
  desired = c(10, 15, 25, 20, 30, 25)
)
desired row_number a b answer
1 10 1 10 NA 10
2 15 2 10 5 15
3 25 3 20 5 25
4 20 4 15 5 20
5 30 5 15 15 30
6 25 6 10 15 25
# Three-group sample as a plain data.frame; `desired` is the expected answer.
dt <- data.frame(
  group   = c("a", "b", "c", "a", "a", "b", "c", "a"),
  value   = c(10, 5, 20, 25, 15, 15, 30, 10),
  desired = c(10, 15, 35, 50, 40, 50, 60, 55)
)
desired row_number a b c answer
1 10 1 10 NA NA 10
2 15 2 10 5 NA 15
3 35 3 10 5 20 35
4 50 4 25 5 20 50
5 40 5 15 5 20 40
6 50 6 15 15 20 50
7 60 7 15 15 30 60
8 55 8 10 15 30 55