Question

我的R问题如下：我有一个来自SQL数据库的data.frame（比方说安全现金流：'cf_table'）。主键有3列：

security_id, quote_date, future_cf_date, (and 'x')

在第4列（例如'x'）上，我将进行一些计算，返回一个向量（而不是单个值），在我的示例中rev(cumsum(rev(x))) - 向后累积的总和，按第一列分组两个cols 。换句话说：“报价日的证券未来现金流量的向后累计金额是多少？” 'x'稀疏，它主要有NA。我怎样才能完成这项任务？我尝试了dplyr，data.table等但没有成功。我的目标是将这个新列附加到原始表中。

有关重现性，请参阅我的帖子的结尾。

有什么想法吗？（顺便说一下，rev(cumsum(rev(x)))有效还是优雅？）

示例数据：

cf_table <- structure(list(security_id = structure(c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", 
"b"), class = "factor"), quote_date = structure(c(2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 3L, 3L, 3L, 4L, 4L, 4L), .Label = c("2014.05.13", 
"2015.04.13", "2015.04.14", "2015.04.15"), class = "factor"), 
    CF.Dátum = structure(c(3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 
    3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 3L, 5L, 6L, 7L, 8L, 9L, 
    10L, 11L, 1L, 2L, 4L, 1L, 2L, 4L, 1L, 2L, 4L), .Label = c("2014.12.22", 
    "2015.06.22", "2015.06.24", "2015.12.22", "2016.06.24", "2017.06.26", 
    "2018.06.25", "2019.06.24", "2020.06.24", "2021.06.24", "2022.06.24"
    ), class = "factor"), future_cf_date = c(NA, NA, NA, NA, 
    NA, 2000L, NA, 10000L, NA, NA, NA, NA, NA, 2000L, NA, 10000L, 
    NA, NA, NA, NA, NA, NA, NA, 10000L, NA, 500L, 10000L, NA, 
    NA, 10000L, NA, NA, 10000L), My.desired.output = c(12000L, 
    12000L, 12000L, 12000L, 12000L, 12000L, 10000L, 10000L, 12000L, 
    12000L, 12000L, 12000L, 12000L, 12000L, 10000L, 10000L, 10000L, 
    10000L, 10000L, 10000L, 10000L, 10000L, 10000L, 10000L, 10500L, 
    10500L, 10000L, 10000L, 10000L, 10000L, 10000L, 10000L, 10000L
    )), .Names = c("security_id", "quote_date", "future_cf_date", "x", 
"My.desired.output"), class = "data.frame", row.names = c(NA, 
-33L))

Answer 1

您可以使用Reduce函数并从向量x的右侧积累，这将向后移动cumsum：

library(dplyr)
cf_table_reduce = function() cf_table %>% group_by(security_id, quote_date) %>% 
  mutate(back_sum = Reduce(function(i,j) sum(i,j,na.rm = T), x, right = T, accumulate = T))

rev(cumsum(rev))的另一个选项是将x中的NA值替换为零，因为cumsum函数无法处理NA值：

cf_table_rev = function() cf_table %>% group_by(security_id, quote_date) %>% 
  mutate(x = replace(x, is.na(x), 0), back_sum = rev(cumsum(rev(x))))

结果：

identical(cf_table_rev(), cf_table_reduce())
# [1] TRUE

sum(cf_table_rev()$back_sum == cf_table$My.desired.output) == nrow(cf_table)
# [1] TRUE

至于速度，这两种方法似乎很接近：

microbenchmark(cf_table_rev(), cf_table_reduce())
# Unit: milliseconds
#               expr      min       lq     mean   median       uq      max neval
#     cf_table_rev() 212.2586 225.9167 332.3184 410.3508 431.9465 452.0192   100
#  cf_table_reduce() 211.2370 225.0572 331.7268 412.5145 432.1195 453.0889   100

我用于比较的数据的维度是：

dim(cf_table)
# [1] 2162688       5

Answer 2

我们可以使用ave中的base R而不使用任何软件包。

with(cf_table, ave(replace(x, is.na(x), 0), security_id, quote_date, 
           FUN = function(x) rev(cumsum(rev(x)))))
#[1] 12000 12000 12000 12000 12000 12000 10000 10000 12000 12000 12000 12000 12000 12000 10000 10000 10000 10000 10000 10000 10000 10000 10000 10000
#[25] 10500 10500 10000 10000 10000 10000 10000 10000 10000

基准

cf_ave <- function() with(cf_table, ave(replace(x, is.na(x), 0), 
    security_id, quote_date, 
        FUN = function(x) rev(cumsum(rev(x)))))
system.time(cf_ave())
#   user  system elapsed 
#      0       0       0 

system.time(cf_table_reduce())
#  user  system elapsed 
#  0.00    0.05    0.06 

system.time(cf_table_rev())
#   user  system elapsed 
#  0.01    0.00    0.02 

library(microbenchmark)
microbenchmark(cf_ave(), cf_table_reduce(), cf_table_rev(),
                      unit = "relative", times = 20L)
#Unit: relative
#            expr       min       lq     mean   median       uq      max neval
#         cf_ave()  1.000000 1.000000 1.000000 1.000000 1.000000  1.00000    20
#cf_table_reduce() 11.098189 9.945726 9.339097 8.648005 8.489285 10.58431    20
#   cf_table_rev()  7.825883 7.090710 6.659378 6.031240 6.075992  8.93274    20

在更大的数据集上

cf_table1 <- copy(cf_table)
set.seed(24)
cf_table <- data.frame(security_id = sample(letters, 1e7, replace = TRUE), 
     quote_date = sample(unique(cf_table$quote_date), 1e7, replace = TRUE), 
    x= sample(10000:20000, 1e7, replace=TRUE), stringsAsFactors=FALSE)
cf_table$x[sample(1e7, 1e6, replace=FALSE)] <- NA
microbenchmark(cf_ave(), cf_table_reduce(), cf_table_rev(),
                  unit = "relative", times = 20L)
#Unit: relative
#             expr       min        lq      mean    median        uq       max neval
#         cf_ave()  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    20
#cf_table_reduce() 13.909809 13.898210 13.913709 13.778034 13.411617 16.466479    20
#   cf_table_rev()  1.077809  1.091534  1.086552  1.099303  1.074822  1.111652    20

如何将键控数据的聚合向量作为R中的新列附加

2 个答案:

基准