如何有效地合并两个data.table
s与完全外部联接,同时处理左侧和右侧滚动最后一次向前观察(LOCF)的缺失值?
真实世界的应用 - 有两个不一定交错的交易规则信号表,X
,Y
,持有(稀疏)信号值随着时间的推移。总体目标是定义复合信号,其中 Signal.z = Signal.x AND Signal.y
X <- data.table(Instrument=rep("SPX",3)
, Date=as.IDate(c("2013-11-20","2013-11-22","2013-11-24"))
, Signal=c(TRUE,FALSE,TRUE), key=c("Instrument", "Date"))
Y <- data.table(Instrument=rep("SPX",3)
, Date=as.IDate(c("2013-11-21","2013-11-23","2013-11-25"))
, Signal=c(FALSE,TRUE,FALSE), key=c("Instrument", "Date"))
期望的结果:
Instrument Date Signal.x Signal.y Signal.z
1: SPX 2013-11-20 TRUE NA NA
2: SPX 2013-11-21 TRUE FALSE FALSE
3: SPX 2013-11-22 FALSE FALSE FALSE
4: SPX 2013-11-23 FALSE TRUE FALSE
5: SPX 2013-11-24 TRUE TRUE TRUE
6: SPX 2013-11-25 TRUE FALSE FALSE
答案 0 :(得分:5)
或许这样的事情:
dates = sort(c(X$Date, Y$Date))
setkey(X, Date)
setkey(Y, Date)
Z = X[J(dates), roll = T][,
Signal.y := Y[J(dates), roll = T]$Signal][,
Signal.z := as.logical(Signal * Signal.y)]
在此想法的基础上,这是为大型示例数据执行此操作的方法:
# assuming keys are set to Instrument, Date in both data.tables
Z = unique(setkey(rbind(setnames(X[Y, roll = T],
c("Instrument", "Date", "Signal.x", "Signal.y")),
setnames(Y[X, roll = T],
c("Instrument", "Date", "Signal.y", "Signal.x")),
use.names = TRUE),
Instrument, Date))[,
Signal.z := as.logical(Signal.x * Signal.y)]
答案 1 :(得分:3)
mnel解释了如何在data.table
包中进行完全外部联接。
这里的应用程序很简单,添加了滚动最后一个观察结果的皱纹(通过roll = TRUE
加入)。
创建一个data.table,其中包含X
或Y
中的所有(唯一)键。
## one way to do the outer join
keys <- unique(rbind(X[,key(X),with = FALSE], Y[,key(Y), with = FALSE]))
## alternate way if you have multiple data.tables to outer join
keys <- lapply(list(X,Y), function(z) z[,key(z), with = FALSE])
keys <- rbindlist(keys)
## this setkey is mostly cosmetic -
## determines whether the final output is sorted or not
setkeyv(keys, names(keys))
##cosmetic changing of column names to minimize confusion
setnames(X,"Signal","Signal.X")
setnames(Y,"Signal","Signal.Y")
## two joins, followed by the definition of the new column
X[Y[keys, roll = TRUE], roll = TRUE][,
Signal.Z := as.logical(Signal.X * Signal.Y)]
## this output is returned invisibly. either assign it or force print
.Last.value
# Instrument Date Signal.X Signal.Y Signal.Z
# 1: SPX 2013-11-20 TRUE NA NA
# 2: SPX 2013-11-21 TRUE FALSE FALSE
# 3: SPX 2013-11-22 FALSE FALSE FALSE
# 4: SPX 2013-11-23 FALSE TRUE FALSE
# 5: SPX 2013-11-24 TRUE TRUE TRUE
# 6: SPX 2013-11-25 TRUE FALSE FALSE
答案 2 :(得分:2)
我将测量三种可用解决方案的时间(Daniel.Krizian,Blue.Magister,eddi)。
为此,我创建了更大的基准数据 - 大信号表X
和Y
X
和Y
表nobs <- 5000 # number of observations for each instrument
nopps <- nobs * 3 # opportunities to trade in the time window studied
ninstr <- 200 # number of instruments
set.seed(2) # set.seed(1) generates "MPM" instrument twice :)
universe <- replicate( ninstr , paste( sample( LETTERS , 3 , repl = TRUE ), collapse = "" ) )
window <- as.Date("2013-11-26") - 1:nopps + 1
frame <- CJ(Instrument=universe, Date=rep(1:nobs))
gen.sig.tbl <- function() {
frame[, Date:= as.IDate(sample(window, size=nobs, replace=F)), by="Instrument"]
setkey(frame,Instrument,Date)
rnd.sig.sparse <- function(nobs) {
frst <- sample(c(FALSE,TRUE), 1)
rep(c(frst,!frst), nobs/2)
}
frame[, Signal:=rnd.sig.sparse(nobs), by="Instrument"]
return(copy(frame))
}
set.seed(1)
X <- gen.sig.tbl()
set.seed(2)
Y <- gen.sig.tbl()
X
Instrument Date Signal
1: AAS 1972-11-02 FALSE
2: AAS 1972-11-04 TRUE
3: AAS 1972-11-07 FALSE
4: AAS 1972-11-08 TRUE
5: AAS 1972-11-10 FALSE
---
999996: ZVH 2013-11-14 FALSE
999997: ZVH 2013-11-15 TRUE
999998: ZVH 2013-11-18 FALSE
999999: ZVH 2013-11-25 TRUE
1000000: ZVH 2013-11-26 FALSE
Y
Instrument Date Signal
1: AAS 1972-11-13 TRUE
2: AAS 1972-11-17 FALSE
3: AAS 1972-11-20 TRUE
4: AAS 1972-11-21 FALSE
5: AAS 1972-11-23 TRUE
---
999996: ZVH 2013-11-16 TRUE
999997: ZVH 2013-11-19 FALSE
999998: ZVH 2013-11-23 TRUE
999999: ZVH 2013-11-24 FALSE
1000000: ZVH 2013-11-25 TRUE
Daniel.Krizian <- function () {
Z <- merge(X, Y, all=TRUE)[, c("Signal.x","Signal.y"):=list( na.locf(Signal.x, na.rm = F)
, na.locf(Signal.y, na.rm = F))
, by=Instrument]
Z[, Signal.z := Signal.x & Signal.y]
# and the last line because (FALSE & NA) == FALSE, whereas NA result is desired
Z[, Signal.z := ifelse(is.na(Signal.x) | is.na(Signal.y), NA, Signal.z)]
return(Z)
}
Blue.Magister <- function() {
keys <- unique(rbind(X[,key(X),with = FALSE], Y[,key(Y), with = FALSE]))
## this setkey is mostly cosmetic -
## determines whether the final output is sorted or not
setkeyv(keys, names(keys))
##cosmetic changing of column names to minimize confusion
setnames(X,"Signal","Signal.X")
setnames(Y,"Signal","Signal.Y")
## two joins, followed by the definition of the new column
Z <- X[Y[keys, roll = TRUE], roll = TRUE][,
Signal.Z := as.logical(Signal.X * Signal.Y)]
Z <- unique(Z)
return(Z)
}
eddi <- function (){
# assuming keys are set to Instrument, Date in both data.tables
Z = unique(setkey(rbind(setnames(X[Y, roll = T],
c("Instrument", "Date", "Signal.x", "Signal.y")),
setnames(Y[X, roll = T],
c("Instrument", "Date", "Signal.y", "Signal.x")),
use.names = TRUE),
Instrument, Date))[,
Signal.z := as.logical(Signal.x * Signal.y)]
return(Z)
}
system.time(Z.DK <- Daniel.Krizian())
user system elapsed
2.70 0.07 3.01
system.time(Z.eddi <- eddi())
user system elapsed
1.14 0.03 1.84
system.time(Z.BM <- Blue.Magister())
user system elapsed
3.35 0.14 3.52
setnames(X,"Signal.X", "Signal") # reset original data back after Blue.Magister() call
setnames(Y,"Signal.Y", "Signal") # reset original data back after Blue.Magister() call
setnames(Z.BM
, c("Signal.X", "Signal.Y", "Signal.Z")
, c("Signal.x", "Signal.y", "Signal.z"))
identical(Z.DK, Z.BM)
TRUE
identical(Z.DK, Z.eddi)
TRUE
答案 3 :(得分:1)
我的解决方案如下;如果你知道更有效的方法,请告诉我!
Z <- merge(X, Y, all=TRUE)[, c("Signal.x","Signal.y"):=list( na.locf(Signal.x, na.rm = F)
, na.locf(Signal.y, na.rm = F))
, by=Instrument]
Z[, Signal.z := Signal.x & Signal.y]
# and the last line because (FALSE & NA) == FALSE, whereas NA result is desired
Z[, Signal.z := ifelse(is.na(Signal.x) | is.na(Signal.y), NA, Signal.z)]