我正在寻找一种更有效的方法来重塑R中的data.table数据。
此刻,我正在循环进行以重塑多个时间序列的预测。
我得到了正确的答案，但是觉得方法不够优雅（不够 data.table 风格）。因此，我向 SO 社区求助，看看是否有更优雅的解决方案。
请参见下面的数据设置,以及两次尝试获得所需的答案。
# load libraries
# library() errors immediately if a package is missing, unlike require(),
# which only returns FALSE and lets the script fail later with a
# confusing "could not find function" error.
library(data.table)
library(lubridate)
# set up data assumptions
id_vec <- letters                  # 26 series IDs, one per lowercase letter
len_id_vec <- length(id_vec)
num_orig_dates <- 7                # number of observation dates per ID
set.seed(123)                      # reproducible simulated balances
# create original data frame: one row per (ID, date), with a simulated
# most_recent_bal following a small multiplicative random walk per ID
orig <- data.table(ID=rep(id_vec,each=num_orig_dates),
date=rep(c(Sys.Date() %m+% months(0: (num_orig_dates-1))),times=len_id_vec),
most_recent_bal=unlist(lapply(round(runif(len_id_vec)*100),function(y){
y*cumprod(1+rnorm(num_orig_dates,0.001,0.002))})))
# add 24 months ahead predictions of balances using a random walk from the original dates
# Each iteration appends two columns by reference (:=):
#   predN_bal  - noisy prediction of the balance N months ahead
#   predN_date - the calendar date N months after `date` (%m+% avoids
#                month-end rollover problems, e.g. Jan 31 + 1 month)
nrow_orig <- nrow(orig)
for(i in seq(24)){
orig[,paste0('pred',i,'_bal'):=most_recent_bal*(1+rnorm(nrow_orig,0.001,0.003))]
orig[,paste0('pred',i,'_date'):=date %m+% months(i)]
}
# First attempt: nested lapply over IDs, then over dates, rbindlist-ing
# the pieces together. NOTE(review): this repeatedly subsets `orig` and
# grows results with rbind inside the inner loop, which is why it is the
# slowest approach in the benchmarks below.
t0 <- Sys.time()
tmp1 <- rbindlist(lapply(unique(orig$ID),function(x){
orig1 <- orig[ID==x,]
bal_cols <- c('most_recent_bal',paste0('pred',seq(24),'_bal'))
date_cols <- c('date',paste0('pred',seq(24),'_date'))
# Go through each original date to realign predicted date and predicted balance
date_vec <- orig1$date
tmp <- rbindlist(lapply(date_vec,function(y){
# t() on the single-row subset flattens the 25 wide date/balance
# columns into plain vectors: one long row per prediction horizon
tmp <- data.table(dates=as.Date(as.vector(t(orig1[date==y,date_cols,with=FALSE]))),
bals=as.vector(t(orig1[date==y,bal_cols,with=FALSE])))
tmp[,type:='prediction']
tmp[,date_prediction_run:=y]
# collect historical information too for plotting purposes:
# every observation on or before this run date, relabelled as history
tmp1 <- orig1[date<=y,c('date','most_recent_bal'),with=FALSE]
if(nrow(tmp1)!=0){
setnames(tmp1,c('date','most_recent_bal'),c('dates','bals'))
tmp1[,type:='history']
tmp1[,date_prediction_run:=y]
tmp <- rbind(tmp,tmp1)
}
tmp
}))
# := returns tmp invisibly, so this tagging step is also the value
# collected by the outer rbindlist
tmp[,ID:=x]
}))
t1 <- Sys.time()
t1-t0 #Time difference of 1.117216 secs
# Second Attempt: a slightly more data.table way which is faster but still very inelegant....
t2 <- Sys.time()
bal_cols <- c('most_recent_bal',paste0('pred',seq(24),'_bal'))
date_cols <- c('date',paste0('pred',seq(24),'_date'))
tmp1a <- rbindlist(lapply(unique(orig$ID),function(x){
# melt balances and dates separately, then cbind the halves back
# together; this relies on melt() preserving row order so that each
# balance stays aligned with its matching prediction date
tmp <- cbind(melt(orig[ID==x,c('date',bal_cols),with=FALSE],
measure.vars = bal_cols,value.name='bals')[,-('variable'),with=FALSE],
data.table(dates=melt(orig[ID==x,date_cols,with=FALSE],
measure.vars = date_cols)[,value]))
setnames(tmp,'date','date_prediction_run')
tmp[,type:='prediction']
# history rows: for each run date (.BY, the current by-group value),
# pull all observations on or before it. NOTE(review): using .BY in j
# with by=date is an unusual spelling of a self non-equi join; the
# accepted answer below shows the cleaner on=.(ID, date>=date) form.
tmp1 <- orig[ID==x,orig[ID==x & date<=.BY,c('date','most_recent_bal'),with=FALSE],by=date]
setnames(tmp1,c('date_prediction_run','dates','bals'))
tmp1[,type:='history']
setcolorder(tmp1,colnames(tmp1)[match(colnames(tmp),colnames(tmp1))])
tmp <- rbind(tmp,tmp1)
tmp[,ID:=x]
tmp
}))
t3 <- Sys.time()
t3-t2 # Time difference of 0.2309799 secs
答案 0（得分：3）
以下是一个 data.table 解决方案：先把要做的事情分解开，写出第一遍的分步实现。注意：在此之后（见下文“更新”部分），我会把各步骤组合起来对该方案做进一步优化，例如合并步骤、链式调用、就地赋值等。正如您所料，更优化的方案如果不先理解这里为教学目的而展示的分步操作，其可读性会大大降低。
# First Pass = Step-by-step (not optimized) just first work out a solution
library(data.table)
# Transform prediction data from `orig` data.table into long format
# i.e. by melting pred#_bal and pred#_date columns
# patterns() pairs the two column sets, so each output row carries the
# matching (balance, date) for a single prediction horizon.
pred_data <-
data.table::melt( orig,
measure = patterns("pred[0-9]+_bal", "pred[0-9]+_date"),
value.name = c("bals", "date_prediction_run"))
pred_data[, type := "prediction"] # add the 'type' column to pred_data (all are type="prediction")
# select desired columns in order
pred_data <- pred_data[, .( dates=date, bals, type, date_prediction_run, ID)]
# Collect historical information from the most_recent_bal column,
# which the OP wants for plotting purposes
# Self non-equi join: for each run date, keep every observation on or
# before it; x.date refers to the date column of the joined-to table.
graph_data <-
orig[ orig,
.(ID, dates=date, bals=most_recent_bal, date_prediction_run=x.date),
on=.(ID, date>=date)]
graph_data[, type := "history"] # these are all type="history"
# final output, combining the prediction data and the graph data:
output <- rbindlist(list(pred_data, graph_data), use.names=TRUE)
下面是我“通过组合一些步骤和链式调用进行优化”的第一次尝试。不过请注意：尽管下面的代码组合了一些步骤并使用了链式调用，看起来更简洁，但它并不会比上面原始的分步解决方案更快——我会在文末给出基准测试时间。我仍把代码留在这里，因为它说明了一个要点，并提供了学习的机会。
通过组合一些步骤和链式调用进行优化的第一次尝试（并不更快！）：

library(data.table)
# Transform prediction data into long format
# by melting pred#_bal and pred#_date columns
# NOTE(review): orig[, type := ...] mutates `orig` by reference *before*
# the melt/join runs; each chained [] still produces and assigns a full
# result, which is why this version is no faster than the step-by-step
# one (see the benchmark discussion below).
pred_data <-
data.table::melt( orig[, type := "prediction"], #add the type column to orig, before melting
measure = patterns("pred[0-9]+_bal", "pred[0-9]+_date"),
value.name = c("bals", "date_prediction_run")
)[, .( dates=date, bals, type, date_prediction_run, ID)] # chain, to select desired columns in order
# FINAL RESULT: rbindlist pred_data to historic data
# (self non-equi join on ID with date>=date supplies the history rows)
pred_data <-
rbindlist( list( pred_data, orig[ orig[, type := "history"],
.(dates=date, bals=most_recent_bal, type, date_prediction_run=x.date, ID),
on=.(ID, date>=date)]
),
use.names=TRUE)
使用非常方便的 microbenchmark 软件包测试计时：
Unit: milliseconds
expr min lq mean median uq max neval
h.l.m_first_attempt 1140.017957 1190.818176 1249.499493 1248.977454 1299.497679 1427.632140 100
h.l.m_second_attempt 231.380930 239.513223 254.702865 249.735005 262.516276 375.762675 100
krads_step.by.step 2.855509 2.985509 3.289648 3.059481 3.269429 6.568006 100
krads_optimized 2.909343 3.073837 3.555803 3.150584 3.554100 12.521439 100
基准测试结果表明，data.table 解决方案相对于 OP 的两个方案在耗时上都有巨大改进。太好了，这正是问题所要求的：我们已经展示了 data.table 可以有多快，而且我希望它同时也简单易读！
但是，请不要错过这里的另一个经验教训：查看微基准测试结果时，请注意我的两个解决方案的平均耗时几乎相同。乍一看这可能说不通：为什么我的“分步”解决方案代码行数那么多，却和我尝试“优化”过的解决方案一样快？
答案：如果您仔细观察，就会发现相同的步骤出现在我的两个解决方案中。是的，在“优化”解决方案中我们使用了链式调用，乍看之下似乎比“分步”方案做的赋值更少。但基准测试结果应该告诉您：赋值工作并没有减少！也就是说，在每一个用 `[]` 把另一个操作“链接”上去的地方，实际上都等同于用 `<-` 把结果赋值回原来的 DT。如果您能理解这一点，就能写出更好的程序：您可以放心地跳过“链接”步骤，改用 `<-` 逐步书写——这样的方案更具可读性、更易调试、也更易维护！
真正能节省时间的地方，在于找出那些在循环或 apply 类操作中被不必要地重复赋值的位置。不过我认为那是另一篇文章的主题了！
如果您想在自己的代码上使用 microbenchmark，我所做的就是：
# Benchmark all four solutions; each expr block should contain a full,
# self-contained run of the corresponding solution.
library(microbenchmark)
mbm <- microbenchmark(
h.l.m_first_attempt = {
# Pasted in h.l.m's first solution, here
},
h.l.m_second_attempt = {
# Pasted in h.l.m's second solution, here
},
krads_step.by.step = {
# Pasted in my first solution, here
},
krads_optimized = {
# Pasted in my second solution, here
},
times = 100L
)
mbm # print the timing summary table
如果要绘制图形,请遵循:
library(ggplot2)
# plot the distribution of timings per benchmarked expression
autoplot(mbm)
答案 1（得分：0）
为此，我使用了 dplyr 和 reshape2 进行了尝试，我觉得它稍微更优雅一些（没有 apply 语句——它们本质上就是 for 循环）。它还把运行时间缩短了约 0.04 秒。
此方案的结果比您的结果少 182 行（原因见文末）。

t0 = Sys.time()
# Extract predicted values in long form
# melt the wide pred#_bal columns into rows; 'variable' carries the
# pred# label, which becomes the 'type' column below
trial_bal = reshape2::melt(orig, id.vars = c("ID", "date"), measure.vars =
c(colnames(orig)[grep("pred[0-9]{1,}_bal", colnames(orig))]))
colnames(trial_bal) = c("ID", "date_prediction_run", "type", "balance")
trial_bal$type = gsub("_bal", "", trial_bal$type) # e.g. "pred3_bal" -> "pred3"
trial_date = reshape2::melt(orig, id.vars = c("ID", "date"), measure.vars =
c(colnames(orig)[grep("pred[0-9]{1,}_date", colnames(orig))]))
colnames(trial_date) = c("ID", "date_prediction_run", "type", "dates")
trial_date$type = gsub("_date", "", trial_date$type)
# join each balance to its matching prediction date on (ID, run date, pred#)
trial = merge.data.frame(trial_date, trial_bal, by = c("ID", "date_prediction_run", "type"))
trial$type = "prediction"
trial = trial %>% select(dates, balance, type, date_prediction_run, ID)
# Extract historical values in long form
temp = orig[, c("ID", "date", "most_recent_bal")]
# cartesian self-join on ID, then keep pairs where the run date (date.x)
# is on or after the observation date (date.y)
temp = merge(temp[, c("ID", "date")], temp, by = "ID", allow.cartesian = TRUE)
temp = temp[temp$date.x >= temp$date.y, ]
temp$type = "history"
temp = temp %>% select(dates = date.y, balance = most_recent_bal, type,
date_prediction_run = date.x, ID)
# Combine prediction and history
trial = rbind(trial, temp)
trial = trial %>% arrange(ID, date_prediction_run, desc(type), dates)
t1 = Sys.time()
t1 - t0 #Time difference of 0.1900001 secs
之所以少 182 行，是因为您的结果中 dates = date_prediction_run 的行出现了两次——一次在 type = prediction 下，一次在 type = history 下。