为用户定义的函数构建高效的循环:data.table

时间:2018-01-19 11:16:12

标签: r for-loop data.table

我试图为我在这里提出的这个函数构建一个有效的for循环:(Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table

我的数据是:

library(dplyr)
library(tidyr)
library(lubridate)
library(data.table)

adherence <- cbind.data.frame(c("1", "2", "3", "1", "2", "3"), c("2013-01-01", "2013-01-01", "2013-01-01", "2013-02-01", "2013-02-01", "2013-02-01"))
names(adherence)[1] <- "ID" 
names(adherence)[2] <- "year"
adherence$year <- ymd(adherence$year)

lsr <- cbind.data.frame(
  c("1", "1", "1", "2", "2", "2", "3", "3"), #ID
  c("2012-03-01", "2012-08-02", "2013-01-06","2012-08-25", "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05"), #eksd
  c("60", "90", "90", "60", "120", "60", "30", "90") # DDD
)
names(lsr)[1] <- "ID"
names(lsr)[2] <- "eksd"
names(lsr)[3] <- "DDD"

lsr$eksd <- as.Date((lsr$eksd))
lsr$DDD <- as.numeric(as.character(lsr$DDD))
lsr$ENDDATE <- lsr$eksd + lsr$DDD
lsr <- as.data.table(lsr)

adherence <- as.data.table(adherence)

minem提出的函数是:

by_minem2 <- function(dt = lsr2) {
  d <- as.numeric(as.Date("2013-02-01"))
  dt[, ENDDATE2 := as.numeric(ENDDATE)]
  x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
  uid <- unique(dt$ID)
  id2 <- setdiff(uid, x$ID)
  id2 <- uid[!(uid %in% x$ID)]
  x2 <- data.table(ID = id2, V1 = 0)
  x <- rbind(x, x2)
  setkey(x, ID)
  x
}

返回:

> by_minem2(lsr)
   ID V1
1:  1 64
2:  2  0
3:  3 63

对于循环,我需要包含有关我评估的时间的信息,因此理想的重复输出如下所示:

cbind(as.Date("2013-02-01"),by_minem2(lsr))

然后我想在不同日期重复几次,将所有内容放入相同的data.table:

time.months <- as.Date("2013-02-01")+(365.25/12)*(0:192) #dates to evaluate at

我试图用这样的for循环来做这件事:

     for (d in min(time.months):max(time.months))
{
  by_minem <- function(dt = lsr2) {
    d <- as.numeric(d)
    dt[, ENDDATE2 := as.numeric(ENDDATE)]
    x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
    uid <- unique(dt$ID)
    id2 <- setdiff(uid, x$ID)
    id2 <- uid[!(uid %in% x$ID)]
    x2 <- data.table(ID = id2, V1 = 0)
    x <- rbind(x, x2)
    setkey(x, ID)
    xtot <- append(xtot,x) 
    xtot <- cbind(d, xtot) # i need to know time of evaluation
    xtot
  }
}

2 个答案:

答案 0 :(得分:1)

类似的东西:

dt <- lsr
dt[, ENDDATE2 := as.numeric(ENDDATE)]
s <- time.months
xtot <- lapply(s, function(d) {
  d <- as.numeric(d)
  x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
  uid <- unique(dt$ID)
  id2 <- setdiff(uid, x$ID)
  id2 <- uid[!(uid %in% x$ID)]
  if (length(id2) > 0) {
    x2 <- data.table(ID = id2, V1 = 0)
    x <- rbind(x, x2)
  }
  setkey(x, ID)
  x
})
for (x in seq_along(xtot)) {
  setnames(xtot[[x]], c("ID", paste0("V", x)))
}

xtot <- Reduce(function(...) merge(..., all = TRUE, by = "ID"), xtot)
xtot

答案 1 :(得分:1)

如相关问题Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table的答案所示,这可以通过在非等连接中进行更新来解决,这可能与data.table有关。

链接问题的不同之处在于,我们需要在加入CJ()之前,使用自己的日期向量创建所有唯一ID的交叉联接lsr

OP提供了一系列日期time.months,其定义

time.months <- as.Date("2013-02-01")+(365.25/12)*(0:192) #dates to evaluate at

导致&#34;歪曲&#34;日期只有在强制使用数字或POSIXct时才可见:

head(lubridate::as_datetime(time.months))
[1] "2013-02-01 00:00:00 UTC" "2013-03-03 10:30:00 UTC" "2013-04-02 21:00:00 UTC"
[4] "2013-05-03 07:30:00 UTC" "2013-06-02 18:00:00 UTC" "2013-07-03 04:30:00 UTC"

问题是这些&#34;日期&#34;不与午夜对齐,而是在白天的某个地方开始。为避免这些歧义,可以使用seq()函数

dates <- seq(as.Date("2013-02-01"), length.out = 193, by = "month")

从每个月的第一天开始创建一系列日期。

此外,使用data.table&lt; {1}}类,它将日期存储为整数(4个字节)而不是双(8个字节)。这样可以节省内存和处理时间,因为可以使用通常更快的整数运算。

IDate
# coerce Date to IDate
idates <- as.IDate(dates)
setDT(lsr)[, eksd := as.IDate(eksd)][, ENDDATE := as.IDate(ENDDATE)]

# cross join unique IDs with dates 
CJ(ID = lsr$ID, date = idates, unique = TRUE)[
  # intialize result column
  , AH := 0L][
    # non-equi join and ...
    lsr, on = .(ID, date >= eksd, date < ENDDATE), 
    # ... update only matching rows
    AH := as.integer(ENDDATE - x.date)][
      # reshape from long to wide format
      , dcast(.SD, ID ~ date)]

买者

请注意,上面的代码假定每个 ID 2013-02-01 2013-03-01 2013-04-01 2013-05-01 2013-06-01 2013-07-01 2013-08-01 [...] 1: 1 64 36 5 0 0 0 0 2: 2 0 0 110 80 49 19 0 3: 3 63 35 4 0 0 0 0 的间隔[eksd, ENDDATE)重叠。这可以通过

验证
ID
lsr[order(eksd), all(eksd - shift(ENDDATE, fill = 0) > 0), keyby = ID]

如果存在重叠,可以使用 ID V1 1: 1 TRUE 2: 2 TRUE 3: 3 TRUE 修改上述代码以在非等连接中聚合。

基准

在另一个相关问题data.table by = xx How do i keep the groups of length 0 when i returns no match中,OP指出,由于其生产数据的大小,性能至关重要。

根据OP's comment by = .EACHI有20 mio行和12列,lsr数据集,我试图不使用的有1.5 mio 2列的行。在另一个question中,OP提到 adherence是几百mio。行

@minem通过在answer中提供基准来回应这个问题。我们可以使用此基准数据来比较不同的答案。

lsr

因此,基准数据集由400 k行和150 k唯一# create benchmark data lsr <- data.frame( ID = c("1", "1", "1", "2", "2", "2", "3", "3"), eksd = as.Date(c("2012-03-01", "2012-08-02", "2013-01-06","2012-08-25", "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05")), DDD = as.integer(c("60", "90", "90", "60", "120", "60", "30", "90")), stringsAsFactors = FALSE) lsr$ENDDATE <- lsr$eksd + lsr$DDD n <- 5e4 lsr2 <- lapply(1:n, function(x) lsr) lsr2 <- rbindlist(lsr2, use.names = T, fill = T, idcol = T) lsr2[, ID := as.integer(paste0(.id, ID))] s组成:

ID
lsr2[, .(.N, uniqueN(ID))]
        N     V2
1: 400000 150000

一次运行的结果显示非equi join # pull data preparation out of the benchmark lsr2i <- copy(lsr2)[, eksd := as.IDate(eksd)][, ENDDATE := as.IDate(ENDDATE)] lsr2[, ENDDATE2 := as.numeric(ENDDATE)] # define date series dates <- seq(as.Date("2013-02-01"), length.out = 193, by = "month") idates <- seq(as.IDate("2013-02-01"), length.out = 193, by = "month") # run benchmark library(microbenchmark) bm <- microbenchmark( minem = { dt <- copy(lsr2) xtot <- lapply(dates, function(d) { d <- as.numeric(d) x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID] uid <- unique(dt$ID) id2 <- setdiff(uid, x$ID) id2 <- uid[!(uid %in% x$ID)] if (length(id2) > 0) { x2 <- data.table(ID = id2, V1 = 0) x <- rbind(x, x2) } setkey(x, ID) x }) for (x in seq_along(xtot)) { setnames(xtot[[x]], c("ID", paste0("V", x))) } xtot <- Reduce(function(...) merge(..., all = TRUE, by = "ID"), xtot) xtot }, uwe = { dt <- copy(lsr2i) CJ(ID = dt$ID, date = idates, unique = TRUE)[, AH := 0L][ dt, on = .(ID, date >= eksd, date < ENDDATE), AH := as.integer(ENDDATE - x.date)][, dcast(.SD, ID ~ date)] }, times = 1L ) print(bm) 方法快4倍以上。

lapply()