我有以下data.table
initial.date <- as.POSIXct('2018-10-27 10:00:00',tz='GMT')
last.date <- as.POSIXct('2018-12-28 17:00:00',tz='GMT')
PriorityDateTime=seq.POSIXt(from=initial.date,to = last.date,by = '30 sec')
TradePrice=seq(from=1, to=length(PriorityDateTime),by = 1)
ndf<- data.frame(PriorityDateTime,TradePrice)
ndf$InstrumentSymbol <- rep_len(x = c('asset1','asset2'),length.out = length(ndf$PriorityDateTime))
ndf$id <- seq(1:length(x = ndf$InstrumentSymbol))
ndf$datetime <- ymd_hms(ndf$PriorityDateTime)
res <- ndf %>% data.table()
看起来像这样:
> res
PriorityDateTime TradePrice InstrumentSymbol id datetime
1: 2018-10-27 10:00:00 1 asset1 1 2018-10-27 10:00:00
2: 2018-10-27 10:00:30 2 asset2 2 2018-10-27 10:00:30
3: 2018-10-27 10:01:00 3 asset1 3 2018-10-27 10:01:00
4: 2018-10-27 10:01:30 4 asset2 4 2018-10-27 10:01:30
5: 2018-10-27 10:02:00 5 asset1 5 2018-10-27 10:02:00
使用data.table
是最优雅,最快的方法:
datetime
最多60秒(时差小于60秒)且{{ 1}}。InstrumentSymbol
的{{1}}:在原始TradePrice
中获得TradePrice[i]
和{{1} }的另一行index
中的新列,例如重新组合为data.frame
和TradePrice
中的新列示例结果:
data.table
在index.minpricewithin60
中,我可以修复一行并将其用于条件。例如,如果我想获取第一个minpricewithin60
与该行的> res
PriorityDateTime TradePrice InstrumentSymbol id datetime minpricewithin60 index.minpricewithin60
1: 2018-10-27 10:00:00 1 asset1 1 2018-10-27 10:00:00 2 2
2: 2018-10-27 10:00:30 2 asset2 2 2018-10-27 10:00:30 4 4
3: 2018-10-27 10:01:00 3 asset1 3 2018-10-27 10:01:00 1 1
4: 2018-10-27 10:01:30 4 asset2 4 2018-10-27 10:01:30 2 2
5: 2018-10-27 10:02:00 5 asset1 5 2018-10-27 10:02:00 3 3
相同的第一个base
,则可以执行TradePrice
。您能否解释一下id
的联接如何实现相同的目标?
编辑:数据现在更大了,将考虑我可以在不到2.5分钟的时间里在我的体面的PC(i7 4750 2B,12GB RAM)上运行的所有答案。干杯。
答案 0 :(得分:2)
这可能有效:
res <- res[1:5,]
res2 <- setDT(res)
res2 <- res2[, `:=` (min_60 = datetime - 60, plus_60 = datetime + 60, idx = .I)][
res2, on = .(InstrumentSymbol = InstrumentSymbol, datetime >= min_60, datetime <= plus_60), allow.cartesian = TRUE][
idx != i.idx, .SD[which.min(abs(i.TradePrice - TradePrice))], by = id][
, .(id, minpricewithin60 = i.TradePrice, index.minpricewithin60 = i.idx)][
res, on = .(id)][, `:=` (min_60 = NULL, plus_60 = NULL, idx = NULL)]
res2[]
id minpricewithin60 index.minpricewithin60 PriorityDateTime TradePrice InstrumentSymbol datetime
1: 1 3 3 2018-10-27 10:00:00 1 asset1 2018-10-27 10:00:00
2: 2 4 4 2018-10-27 10:00:30 2 asset2 2018-10-27 10:00:30
3: 3 1 1 2018-10-27 10:01:00 3 asset1 2018-10-27 10:01:00
4: 4 2 2 2018-10-27 10:01:30 4 asset2 2018-10-27 10:01:30
5: 5 3 3 2018-10-27 10:02:00 5 asset1 2018-10-27 10:02:00
答案 1 :(得分:2)
我对代码进行了分解,以使其更容易看到正在发生的问题并进行故障排除。确实,这只是最后一行需要任何时间。我还使价格数据更加有趣和可测试。它可以在我的笔记本电脑上运行约1.3分钟。
library(data.table)
library(lubridate)
set.seed(1)
initial.date <- as.POSIXct('2018-10-27 10:00:00',tz='GMT')
last.date <- as.POSIXct('2018-12-28 17:00:00',tz='GMT')
PriorityDateTime=seq.POSIXt(from=initial.date,to = last.date,by = '30 sec')
TradePrice=runif(length(PriorityDateTime))
ndf<- data.frame(PriorityDateTime,TradePrice)
ndf$InstrumentSymbol <- rep_len(x = c('asset1','asset2'),length.out = length(ndf$PriorityDateTime))
ndf$id <- seq(1:length(x = ndf$InstrumentSymbol))
ndf$datetime <- ymd_hms(ndf$PriorityDateTime)
setDT(ndf)
# Relevant Code
# Setup (Trivial Runtime):
ndf[, datetime_max := datetime + 60]
ndf[, datetime_min := datetime - 60]
ndf_x <- copy(ndf)
ndf_y <- copy(ndf)
names(ndf_x) <- paste0(names(ndf),"_x")
names(ndf_y) <- paste0(names(ndf),"_y")
ndf_join <- ndf_x[ndf_y,on = .(InstrumentSymbol_x = InstrumentSymbol_y, datetime_x >= datetime_min_y, datetime_x <= datetime_max_y), mult = "all", allow.cartesian = TRUE]
ndf_join <- ndf_join[id_x != id_y]
ndf_join[, price_delta := abs(TradePrice_y - TradePrice_x)]
这是花费时间最多的代码:
# Harworking Runtime:
time_now <- Sys.time()
ndf_out <- ndf_join[,.SD[which.min(price_delta), .(which_price = id_x, what_price = TradePrice_x)],
by = .(PriorityDateTime_y,TradePrice_y, id_y, InstrumentSymbol_x, datetime_y)]
cat(Sys.time() - time_now)
# 1.289397
输出:
ndf_out
PriorityDateTime_y TradePrice_y id_y InstrumentSymbol_x datetime_y which_price what_price
1: 2018-10-27 10:00:00 0.26550866 1 asset1 2018-10-27 10:00:00 3 0.57285336
2: 2018-10-27 10:00:30 0.37212390 2 asset2 2018-10-27 10:00:30 4 0.90820779
3: 2018-10-27 10:01:00 0.57285336 3 asset1 2018-10-27 10:01:00 1 0.26550866
4: 2018-10-27 10:01:30 0.90820779 4 asset2 2018-10-27 10:01:30 6 0.89838968
5: 2018-10-27 10:02:00 0.20168193 5 asset1 2018-10-27 10:02:00 3 0.57285336
---
179397: 2018-12-28 16:58:00 0.54342007 179397 asset1 2018-12-28 16:58:00 179395 0.55391579
179398: 2018-12-28 16:58:30 0.25181676 179398 asset2 2018-12-28 16:58:30 179400 0.28088354
179399: 2018-12-28 16:59:00 0.08879969 179399 asset1 2018-12-28 16:59:00 179401 0.19670841
179400: 2018-12-28 16:59:30 0.28088354 179400 asset2 2018-12-28 16:59:30 179398 0.25181676
179401: 2018-12-28 17:00:00 0.19670841 179401 asset1 2018-12-28 17:00:00 179399 0.08879969
答案 2 :(得分:2)
OP没有提到新数据集的大小。但是Rcpp
解决方案应该可以加快速度。
根据之前的评论:
mtd1 <- function() {
ndf[, rn:=.I]
iidx <- ndf[
.(inst=InstrumentSymbol, prevMin=datetime-60L, nextMin=datetime+60L, idx=id, tp=TradePrice),
.SD[id != idx, rn[which.min(abs(TradePrice - tp))]],
by=.EACHI,
on=.(InstrumentSymbol=inst, datetime>=prevMin, datetime<=nextMin)];
ndf[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
}
arg0naut的方法:
mtd2 <- function() {
res2[, `:=` (min_60 = datetime - 60, plus_60 = datetime + 60, idx = .I)][
res2, on = .(InstrumentSymbol = InstrumentSymbol, datetime >= min_60, datetime <= plus_60), allow.cartesian = TRUE][
idx != i.idx, .SD[which.min(abs(i.TradePrice - TradePrice))], by = id][
, .(id, minpricewithin60 = i.TradePrice, index.minpricewithin60 = i.idx)][
res, on = .(id)][, `:=` (min_60 = NULL, plus_60 = NULL, idx = NULL)]
}
可能的Rcpp方法:
library(Rcpp)
cppFunction('
NumericVector nearestPrice(NumericVector id, NumericVector datetime, NumericVector price) {
int i, j, n = id.size();
NumericVector res(n);
double prev, diff;
for (i=0; i<n; i++) {
prev = 100000;
j = i-1;
while (datetime[j] >= datetime[i]-60 && j>=0) {
diff = std::abs(price[i] - price[j]);
if (diff < prev) {
res[i] = id[j];
prev = diff;
}
j--;
}
j = i+1;
while (datetime[j] <= datetime[i]+60 && j<=n) {
diff = std::abs(price[i] - price[j]);
if (diff < prev) {
res[i] = id[j];
prev = diff;
}
j++;
}
}
return(res);
}
')
mtd3 <- function() {
setorder(ndf2, InstrumentSymbol, PriorityDateTime)
iidx <- ndf2[, nearestPrice(.I, datetime, TradePrice), by=.(InstrumentSymbol)]
ndf2[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
}
计时代码:
library(microbenchmark)
microbenchmark(mtd1(), mtd2(), mtd3(), times=3L)
时间:
Unit: milliseconds
expr min lq mean median uq max neval
mtd1() 49447.09713 49457.12408 49528.14395 49467.15103 49568.66737 49670.18371 3
mtd2() 64189.67241 64343.67138 64656.40058 64497.67034 64889.76466 65281.85899 3
mtd3() 17.33116 19.58716 22.36557 21.84316 24.88277 27.92238 3
数据:
set.seed(0L)
initial.date <- as.POSIXct('2018-01-01 00:00:00', tz='GMT')
last.date <- initial.date + 30 * (180000/2)
PriorityDateTime <- seq.POSIXt(from=initial.date, to=last.date, by='30 sec')
library(data.table)
ndf <- data.table(PriorityDateTime=c(PriorityDateTime, PriorityDateTime),
TradePrice=rnorm(length(PriorityDateTime)*2, 100, 20),
InstrumentSymbol=rep(c('asset1','asset2'), each=length(PriorityDateTime)),
datetime=c(PriorityDateTime, PriorityDateTime))
setorder(ndf, InstrumentSymbol, PriorityDateTime)[, id := .I]
res <- copy(ndf)
res2 <- copy(ndf)
ndf2 <- copy(ndf)
答案 3 :(得分:0)
对到目前为止提出的不同解决方案进行基准测试(作为基准,我的基本R方法用此数据花费了大约55分钟):
library(microbenchmark)
microbenchmark(Chris(),
chinsoon12.cpp(),
arg0naut(),
chinsoon12.data.table(), times=3L)
这是通过规格i5-6500T @ 2.50GHz和8GB RAM完成的。
> tm
Unit: milliseconds
expr min lq mean median uq max neval cld
Chris() 95605.92838 95674.46039 96735.74794 95742.9924 97300.65772 98858.32305 3 d
chinsoon12.cpp() 22.69009 23.07224 23.32106 23.4544 23.63655 23.81871 3 a
arg0naut() 84848.28652 85555.15312 86985.39963 86262.0197 88053.95619 89845.89267 3 c
chinsoon12.data.table() 66327.23992 66838.09245 67695.28538 67348.9450 68379.30811 69409.67124 3 b
我知道问题与data.table
有关,但是考虑到Rcpp方法的速度快了2886.251倍,我将为此解决方案提供奖励。非常感谢
完整代码:
library(Rcpp)
library(data.table)
initial.date <- as.POSIXct('2018-10-27 10:00:00',tz='GMT')
last.date <- as.POSIXct('2018-12-28 17:00:00',tz='GMT')
PriorityDateTime=seq.POSIXt(from=initial.date,to = last.date,by = '30 sec')
TradePrice=seq(from=1, to=length(PriorityDateTime),by = 1)
ndf<- data.frame(PriorityDateTime,TradePrice)
ndf$InstrumentSymbol <- rep_len(x = c('asset1','asset2'),length.out = length(ndf$PriorityDateTime))
ndf$id <- seq(1:length(x = ndf$InstrumentSymbol))
ndf$datetime <- ymd_hms(ndf$PriorityDateTime)
res <- ndf %>% data.table()
res2 <- res
setDT(ndf)
ndf2 <- ndf
chinsoon12.data.table <- function() {
ndf[, rn:=.I]
iidx <- ndf[
.(inst=InstrumentSymbol, prevMin=datetime-60L, nextMin=datetime+60L, idx=id, tp=TradePrice),
.SD[id != idx, rn[which.min(abs(TradePrice - tp))]],
by=.EACHI,
on=.(InstrumentSymbol=inst, datetime>=prevMin, datetime<=nextMin)];
ndf[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
}
arg0naut <- function() {
res2[, `:=` (min_60 = datetime - 60, plus_60 = datetime + 60, idx = .I)][
res2, on = .(InstrumentSymbol = InstrumentSymbol, datetime >= min_60, datetime <= plus_60), allow.cartesian = TRUE][
idx != i.idx, .SD[which.min(abs(i.TradePrice - TradePrice))], by = id][
, .(id, minpricewithin60 = i.TradePrice, index.minpricewithin60 = i.idx)][
res, on = .(id)][, `:=` (min_60 = NULL, plus_60 = NULL, idx = NULL)]
}
cppFunction('NumericVector nearestPrice(NumericVector id, NumericVector datetime, NumericVector price) {
int i, j, n = id.size();
NumericVector res(n);
double prev, diff;
for (i=0; i<n; i++) {
prev = 100000;
j = i-1;
while (datetime[j] >= datetime[i]-60 && j>=0) {
diff = std::abs(price[i] - price[j]);
if (diff < prev) {
res[i] = id[j];
prev = diff;
}
j--;
}
j = i+1;
while (datetime[j] <= datetime[i]+60 && j<=n) {
diff = std::abs(price[i] - price[j]);
if (diff < prev) {
res[i] = id[j];
prev = diff;
}
j++;
}
}
return(res);
}')
chinsoon12.cpp <- function() {
setorder(ndf2, InstrumentSymbol, PriorityDateTime)
iidx <- ndf2[, nearestPrice(.I, datetime, TradePrice), by=.(InstrumentSymbol)]
ndf2[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
}
# Setup (Trivial Runtime):
Chris <- function() {
ndf[, datetime_max := datetime + 60]
ndf[, datetime_min := datetime - 60]
ndf_x <- copy(ndf)
ndf_y <- copy(ndf)
names(ndf_x) <- paste0(names(ndf),"_x")
names(ndf_y) <- paste0(names(ndf),"_y")
ndf_join <- ndf_x[ndf_y,on = .(InstrumentSymbol_x = InstrumentSymbol_y, datetime_x >= datetime_min_y, datetime_x <= datetime_max_y), mult = "all", allow.cartesian = TRUE]
ndf_join <- ndf_join[id_x != id_y]
ndf_join[, price_delta := abs(TradePrice_y - TradePrice_x)]
# Harworking Runtime:
time_now <- Sys.time()
ndf_out <- ndf_join[,.SD[which.min(price_delta), .(which_price = id_x, what_price = TradePrice_x)],
by = .(PriorityDateTime_y,TradePrice_y, id_y, InstrumentSymbol_x, datetime_y)]
}
library(microbenchmark)
tm <- microbenchmark(Chris(),
chinsoon12.cpp(),
arg0naut(),
chinsoon12.data.table(), times=3L)
ggplot2::autoplot(tm[c(2:4),])