I have a dataset of items and I want to complete the data for each item. Below is what I have tried.
x = structure(list(item = c("i1", "i1", "i1", "i1", "i2", "i2", "i2",
"i2"), origin = c("A", "A", "B", "B", "C", "C", "D", "D"), destination = c("a",
"a", "b", "b", "c", "c", "d", "d"), date = c("Q1", "Q2", "Q2",
"Q3", "Q2", "Q3", "Q3", "Q4"), ton = 1:8), .Names = c("item",
"origin", "destination", "date", "ton"), class = "data.frame", row.names = c(NA,
-8L))
TEST1: this gives exactly the result I want (however, group_by slows things down badly when applied to the real dataset, which contains many items):
x %>%
  group_by(item) %>%
  tidyr::complete(tidyr::nesting(origin, destination), date) %>%
  ungroup()
TEST2: this creates dates for some items that should not get them, i.e. too many rows:
x %>% tidyr::complete(tidyr::nesting(item, origin, destination), date)
Is there a way to complete this dataset to get the same result as TEST1, but without group_by, so that it is faster? Or a data.table equivalent?
Thanks
Answer 0 (score: 1)
Using data.table: generate the permutations of date and an index for each (item, origin, destination) group, then look up the values. Hopefully it will be faster.
library(data.table)
setDT(x)
#create a group index for each combination of item, origin, destination
x[, g := .GRP, by=.(item, origin, destination)]
gcols <- c("origin","destination")
vcols <- c("ton")
#create the permutations of date and group
x[, CJ(g=g, date=date, unique=TRUE), by=.(item)][
#lookup the original group values
x, (gcols) := mget(paste0("i.", gcols)), on=.(item, g)][
#lookup the other values
x, (vcols) := mget(paste0("i.", vcols)), on=.(item, g, date)]
Output:
item g date origin destination ton
1: i1 1 Q1 A a 1
2: i1 1 Q2 A a 2
3: i1 1 Q3 A a NA
4: i1 2 Q1 B b NA
5: i1 2 Q2 B b 3
6: i1 2 Q3 B b 4
7: i2 3 Q2 C c 5
8: i2 3 Q3 C c 6
9: i2 3 Q4 C c NA
10: i2 4 Q2 D d NA
11: i2 4 Q3 D d 7
12: i2 4 Q4 D d 8
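As a sanity check (not part of the original answer), the data.table result can be compared against TEST1 with fsetequal, ignoring row order and the helper index g; this is a sketch assuming both approaches are run on the question's sample data:

```r
library(data.table)
library(dplyr)
library(tidyr)

x <- data.frame(item = c("i1","i1","i1","i1","i2","i2","i2","i2"),
                origin = c("A","A","B","B","C","C","D","D"),
                destination = c("a","a","b","b","c","c","d","d"),
                date = c("Q1","Q2","Q2","Q3","Q2","Q3","Q3","Q4"),
                ton = 1:8, stringsAsFactors = FALSE)

# TEST1 reference result from the question
ref <- x %>%
  group_by(item) %>%
  complete(nesting(origin, destination), date) %>%
  ungroup()

# data.table result from this answer, with the helper index g dropped
dt <- as.data.table(x)
dt[, g := .GRP, by = .(item, origin, destination)]
gcols <- c("origin", "destination")
res <- dt[, CJ(g = g, date = date, unique = TRUE), by = .(item)][
  dt, (gcols) := mget(paste0("i.", gcols)), on = .(item, g)][
  dt, ton := i.ton, on = .(item, g, date)][, g := NULL]

# same set of rows, regardless of row and column ordering
fsetequal(as.data.table(ref)[, names(res), with = FALSE], res)
```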
Edit: to address OP's comment about the error "cannot allocate vector of size 3.3Gb":
I ran the code with the following sample dataset and RAM usage was around 700k throughout the computation.
library(data.table)
set.seed(0L)
sz <- 2e6
x <- data.table(item=rep(seq_len(sz/4), each=4L),
origin=sample(LETTERS, sz, TRUE),
destination=sample(letters, sz, TRUE),
date=paste0("Q",sample(1:4, sz, TRUE)),
ton=seq_len(sz))
setorder(x, item, origin, destination, date)
Answer 1 (score: 1)
I don't have experience with RAM limitations, but here is an attempt to break the process into smaller steps, keeping the amount of data at each step to a minimum:
setDT(x)
gcols <- c("origin", "destination")
x[, g := .GRP, by = gcols]
setkey(x, g, date)
# Create a lookup table to refer to later so we can drop these columns
lut_g_od <- x[, .SD[1], by = g, .SDcols = gcols]
x[, (gcols) := NULL]
# Split by items... so we can work in stepwise fashion
x <- split(x, by = "item", keep.by = FALSE)
for (i in seq_along(x)) {
x[[i]] <- x[[i]][CJ(g=g, date=date, unique=TRUE)]
}
x <- rbindlist(x, idcol = "item")
# Now if you want to get back in the original origin+destination
setkey(x, g)
x <- x[lut_g_od][, g := NULL]
x[]
# item date ton origin destination
# 1: i1 Q1 1 A a
# 2: i1 Q2 2 A a
# 3: i1 Q3 NA A a
# 4: i1 Q1 NA B b
# 5: i1 Q2 3 B b
# 6: i1 Q3 4 B b
# 7: i2 Q2 5 C c
# 8: i2 Q3 6 C c
# 9: i2 Q4 NA C c
# 10: i2 Q2 NA D d
# 11: i2 Q3 7 D d
# 12: i2 Q4 8 D d
Answer 2 (score: 0)
We can use a split-apply-combine strategy with group_split from dplyr and map_dfr from purrr. Here is the result.
library(dplyr)
library(tidyr)
library(purrr)
x %>%
group_split(item) %>%
map_dfr(~complete(.x, nesting(item, origin, destination), date))
# # A tibble: 12 x 5
# item origin destination date ton
# <chr> <chr> <chr> <chr> <int>
# 1 i1 A a Q1 1
# 2 i1 A a Q2 2
# 3 i1 A a Q3 NA
# 4 i1 B b Q1 NA
# 5 i1 B b Q2 3
# 6 i1 B b Q3 4
# 7 i2 C c Q2 5
# 8 i2 C c Q3 6
# 9 i2 C c Q4 NA
# 10 i2 D d Q2 NA
# 11 i2 D d Q3 7
# 12 i2 D d Q4 8
和microbenchmark
比单独的group_split
快。
map_dfr
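The original answer does not show the benchmark itself; the following sketch shows how such a comparison could be run, reusing the sample-data generator from answer 0 at a smaller, assumed size (actual timings will vary by machine and data size):

```r
library(dplyr)
library(tidyr)
library(purrr)
library(microbenchmark)

# small synthetic dataset in the shape of the question's data (size is an assumption)
set.seed(0)
sz <- 2000
x <- data.frame(item = rep(seq_len(sz / 4), each = 4L),
                origin = sample(LETTERS, sz, TRUE),
                destination = sample(letters, sz, TRUE),
                date = paste0("Q", sample(1:4, sz, TRUE)),
                ton = seq_len(sz), stringsAsFactors = FALSE)

# compare TEST1 (group_by) against the group_split + map_dfr approach
microbenchmark(
  group_by = x %>%
    group_by(item) %>%
    complete(nesting(origin, destination), date) %>%
    ungroup(),
  group_split = x %>%
    group_split(item) %>%
    map_dfr(~ complete(.x, nesting(item, origin, destination), date)),
  times = 3L
)
```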