我正在处理一些大型时间序列数据集,每个文件中有大约200万行。到目前为止,我一直在使用ddply来聚合数据,就像我想要的那样,但不幸的是它变得太慢了,我真的需要一种更快的方式。这是我的代码:
DF <- read.csv(file = "NSE/20151221/AUROPHARMA15DECFUT_20151221_ob.csv",header = FALSE,sep = "", col.names = c("DateTime","Seq","BP1","BQ1","BO1","AP1","AQ1","AO1","BP2","BQ2","BO2","AP2","AQ2","AO2","BP3","BQ3","BO3","AP3","AQ3","AO3","BP4","BQ4","BO4","AP4","AQ4","AO4","BP5","BQ5","BO5","AP5","AQ5","AO5","BP6","BQ6","BO6","AP6","AQ6","AO6","BP7","BQ7","BO7","AP7","AQ7","AO7","BP8","BQ8","BO8","AP8","AQ8","AO8","BP9","BQ9","BO9","AP9","AQ9","AO9","BP10","BQ10","BO10","AP10","AQ10","AO10", "C", "Price", "Qty", "OldPrice", "OldQty"), colClasses = c(NA, rep("integer",31), rep("NULL", 35)))
DF <- DF[which(DF$DateTime != 0),]
options(digits.secs = 3)
DF$DateTime = as.POSIXct(DF$DateTime/(10^9), origin="1970-01-01")
completecase <- DF[complete.cases(DF),]
midpoint = data.frame(DateTime=completecase$DateTime, MP=(completecase$BP1+completecase$AP1)/2)
**# creating 10 millisecond time intervals**
cuts = seq.POSIXt(from = min(midpoint$DateTime), to = max(midpoint$DateTime), by = .01)
**#creating new Time variable with the 10 millisecond breaks**
midpoint$Time = cut(midpoint$DateTime, breaks = cuts)
**#summarizing the MP variable every 10millisecond while keeping the empty time frames with the .drop = FALSE argument**
mp = ddply(midpoint, .(Time), summarise, MP = mean(MP), .drop = FALSE)
mp$Time = as.POSIXct(mp$Time)
mp_xts = xts(mp$MP, mp$Time, tzone = 'Asia/Kolkata')
mp_xts = mp_xts["2015-12-21 09:15:00.000/2015-12-21 15:30:00.000"]
mp_xts = makeReturns(mp_xts)
ddply太慢了。我知道我可以使用data.table包或dplyr包来做到这一点,但我似乎无法弄清楚是复制.drop = FALSE参数。其他软件包都会丢弃所有空的时间间隔,但对我来说保留它们以进行分析非常重要。
我要做的基本上是每10毫秒聚合一次我的时间序列,并用0或locf填充NA时间帧。所以,如果它做得对,我的输出应该有6.25 * 60 * 60 * 100 = 2,250,000行。
任何想法如何在与ddply完全相同的情况下更快地完成这项工作?
> dput(DF[1:20,])
structure(list(DateTime = structure(c(1450669500.804, 1450669500.806,
1450669500.806, 1450669500.807, 1450669500.807, 1450669500.808,
1450669500.812, 1450669500.813, 1450669500.813, 1450669500.813,
1450669500.814, 1450669500.819, 1450669500.82, 1450669500.82,
1450669500.827, 1450669500.85, 1450669500.85, 1450669500.85,
1450669500.851, 1450669500.851), class = c("POSIXct", "POSIXt"
), tzone = ""), Seq = c(104L, 163L, 169L, 190L, 198L, 227L, 301L,
315L, 319L, 320L, 326L, 404L, 429L, 435L, 583L, 928L, 931L, 932L,
944L, 947L), BP1 = c(82055L, 82055L, 82055L, 82055L, 82055L,
82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L,
82630L, 82630L, 82630L, 82630L, 82630L, 82830L, 82830L), BQ1 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), BO1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AP1 = c(0L, 87800L,
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L,
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83795L, 83795L,
83795L, 83795L), AQ1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO1 = c(0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), BP2 = c(0L, 0L, 0L, 0L, 0L, 82055L, 82525L, 82525L,
82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L,
82525L, 82525L, 82630L, 82630L), BQ2 = c(0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
BO2 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), AP2 = c(0L, 0L, 87800L, 84805L,
84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L,
84230L, 84230L, 84230L, 84230L, 83800L, 83800L, 83800L, 83800L
), AQ2 = c(0L, 0L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), AO2 = c(0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), BP3 = c(0L, 0L, 0L, 0L, 0L, 0L, 82055L, 82055L, 82055L,
82055L, 82055L, 82320L, 82320L, 82320L, 82320L, 82320L, 82320L,
82320L, 82525L, 82525L), BQ3 = c(0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
BO3 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), AP3 = c(0L, 0L, 0L, 87800L,
84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L,
84805L, 84805L, 84805L, 84805L, 84230L, 84230L, 84230L, 84230L
), AQ3 = c(0L, 0L, 0L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO3 = c(0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), BP4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 82035L, 82035L,
82035L, 82055L, 82055L, 82055L, 82055L, 82060L, 82060L, 82060L,
82320L, 82320L), BQ4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), AP4 = c(0L, 0L, 0L, 0L, 87800L, 87800L,
87800L, 85380L, 85380L, 85380L, 85365L, 85365L, 85365L, 85365L,
84980L, 84980L, 84805L, 84805L, 84805L, 84400L), AQ4 = c(0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 1L), AO4 = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80035L, 80035L, 82035L, 82035L,
82035L, 82035L, 82055L, 82055L, 82055L, 82060L, 82060L),
BQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), BO5 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), AP5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 87800L, 87800L, 87800L,
85380L, 85380L, 85380L, 85380L, 85365L, 85365L, 84980L, 84980L,
84980L, 84805L), AQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), AO5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L)), .Names = c("DateTime", "Seq", "BP1", "BQ1",
"BO1", "AP1", "AQ1", "AO1", "BP2", "BQ2", "BO2", "AP2", "AQ2",
"AO2", "BP3", "BQ3", "BO3", "AP3", "AQ3", "AO3", "BP4", "BQ4",
"BO4", "AP4", "AQ4", "AO4", "BP5", "BQ5", "BO5", "AP5", "AQ5",
"AO5"), row.names = c(NA, 20L), class = "data.frame")
如果我发布任何其他信息并且非常感谢您的帮助,请告诉我。
> sessionInfo()
R version 3.2.2 (2015-08-14)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu precise (12.04.5 LTS)
locale:
[1] LC_CTYPE=en_IN.UTF-8 LC_NUMERIC=C LC_TIME=en_IN.UTF-8 LC_COLLATE=en_IN.UTF-8 LC_MONETARY=en_IN.UTF-8 LC_MESSAGES=en_IN.UTF-8 LC_PAPER=en_IN.UTF-8
[8] LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] highfrequency_0.4 xts_0.9-7 zoo_1.7-12 data.table_1.9.7
loaded via a namespace (and not attached):
[1] tools_3.2.2 grid_3.2.2 lattice_0.20-33
答案 0 :(得分:3)
最新的development version data.table
添加了非等连接,为这些问题提供了非常紧凑和快速的解决方案:
dt = as.data.table(df) # or convert in place using setDT
breaks = dt[, {tmp = seq(min(DateTime) - 0.01, max(DateTime) + 0.01, 0.01);
.(Start = tmp, End = c(tail(tmp, -1), Inf))}];
dt[breaks, on = .(DateTime >= Start, DateTime < End),
lapply(.SD, mean), by = .EACHI]
答案 1 :(得分:2)
除了尝试在Rcpp中编写自己的C ++实现之外,不能比data.table做得更好。无论如何,这是一个data.table解决方案,使用findInterval()
代替cut()
,这应该稍快一些。另外,我已经在min和max之外扩展了休息时间以获得一些额外的空间隔来进行测试。
library(data.table);
setDT(DF);
breaks <- seq.POSIXt(min(DF$DateTime)-0.01,max(DF$DateTime)+0.01,0.01);
breaks;
## [1] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST"
## [4] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST"
## [7] "2015-12-20 22:45:00 EST"
diff(breaks);
## Time differences in secs
## [1] 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999
ints <- findInterval(DF$DateTime,breaks);
ints;
## [1] 2 2 2 2 2 2 2 2 2 2 3 3 3 3 4 6 6 6 6 6
agg <- DF[,lapply(.SD,mean),breaks[ints],.SDcols=-1L];
agg;
## breaks Seq BP1 BQ1 BO1 AP1 AQ1 AO1 BP2 BQ2 BO2 AP2 AQ2 AO2 BP3 BQ3 BO3 AP3 AQ3 AO3 BP4 BQ4 BO4 AP4 AQ4 AO4 BP5 BQ5 BO5 AP5 AQ5 AO5
## 1: 2015-12-20 22:45:00 230.6 82342.5 1 1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6 8003.5 0.1 0.1 26340 0.3 0.3
## 2: 2015-12-20 22:45:00 398.5 82630.0 1 1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0
## 3: 2015-12-20 22:45:00 583.0 82630.0 1 1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0
## 4: 2015-12-20 22:45:00 936.4 82710.0 1 1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0
emp <- setdiff(seq_along(breaks),ints);
emp;
## [1] 1 5 7
agg <- rbind(fill=T,agg,data.table(breaks=breaks[emp]));
agg;
## breaks Seq BP1 BQ1 BO1 AP1 AQ1 AO1 BP2 BQ2 BO2 AP2 AQ2 AO2 BP3 BQ3 BO3 AP3 AQ3 AO3 BP4 BQ4 BO4 AP4 AQ4 AO4 BP5 BQ5 BO5 AP5 AQ5 AO5
## 1: 2015-12-20 22:45:00 230.6 82342.5 1 1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6 8003.5 0.1 0.1 26340 0.3 0.3
## 2: 2015-12-20 22:45:00 398.5 82630.0 1 1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0
## 3: 2015-12-20 22:45:00 583.0 82630.0 1 1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0
## 4: 2015-12-20 22:45:00 936.4 82710.0 1 1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0
## 5: 2015-12-20 22:45:00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 6: 2015-12-20 22:45:00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 7: 2015-12-20 22:45:00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA