如何按固定的成交量大小汇总股票市场数据?

时间:2014-07-29 04:17:19

标签: r time-series xts quantitative-finance

目标:按每5000股的成交量间隔分割股票市场数据

数据格式:日期,时间,价格,数量

我的代码在100万行的数据框(data frame)上运行得非常慢,有没有更快的方法?下面附上了我的代码和所用的数据集。谢谢你的帮助!

我的代码:

# Parse the inline CSV text held in ZZ into a data frame.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data1 <- read.table(text = ZZ, sep = ",", header = TRUE)
# The CSV header calls the fourth column "Size"; the rest of the script refers
# to it as "Volume", so all four columns are named explicitly here.
colnames(data1) <- c("Date", "Time", "Price", "Volume")

# Column that will hold the volume-bin index of every trade. It is sized
# explicitly so the assignment stays valid when data1 has zero rows.
data1$volBinIdx <- rep(NA, nrow(data1))

# Running state: the current bin number and the volume accumulated in it.
volBin <- 1
sumVol <- 0

# Walk the trades in order and label each one with its bin. A trade stays in
# the current bin while the accumulated volume is at most 5000 shares.
# seq_len() is used instead of 1:nrow(), which would iterate over c(1, 0)
# for an empty data frame.
for (i in seq_len(nrow(data1))) {
  sumVol <- sumVol + data1[i, "Volume"]
  if (sumVol <= 5000) {
    data1[i, "volBinIdx"] <- volBin
  } else {
    # This trade pushes the running total past 5000: open the next bin and
    # make this trade its first member, restarting the total from its volume.
    volBin <- volBin + 1
    data1[i, "volBinIdx"] <- volBin
    sumVol <- data1[i, "Volume"]
  }
}

# Collapse the trades of each volume bin into a single row.
# head(x, 1) / tail(x, 1) replace first(x) / last(x): the latter are not base R
# functions and this script attaches no package that would provide them.
# a1 carries four values per bin: first, highest, lowest and last price.
a1 <- aggregate(
  data1$Price,
  list(bin = data1$volBinIdx),
  function(x) cbind(head(x, 1), max(x), min(x), tail(x, 1))
)
# a2 / a3 keep the time and date of the first trade in each bin.
a2 <- aggregate(data1$Time, list(bin = data1$volBinIdx), function(x) head(x, 1))
a3 <- aggregate(data1$Date, list(bin = data1$volBinIdx), function(x) head(x, 1))

# Assemble the result: column 2 of each aggregate holds the computed values
# (column 1 is the bin index). drop = FALSE keeps the single-column pieces as
# data frames so cbind() yields a data frame.
x3 <- cbind(a3[, 2, drop = FALSE], a2[, 2, drop = FALSE], a1[, 2])
colnames(x3) <- c("Date", "Time", "Open", "High", "Low", "Close")

我的数据集:

ZZ<-"
Date,Time,Price,Size
02/07/2014,09:30:01,3,500
02/07/2014,09:30:29,3,42
02/07/2014,09:35:56,3,100
02/07/2014,09:37:17,3,100
02/07/2014,09:37:28,3.2,900
02/07/2014,09:37:35,3.2,4900
02/07/2014,09:37:51,3.2,1000
02/07/2014,09:42:11,3.2,500
02/07/2014,10:00:31,3,2400
02/07/2014,10:00:37,3.2,500
02/07/2014,10:00:44,3.2,3347
02/07/2014,10:07:33,3.2,1000
02/07/2014,10:31:42,3.24,1000
02/07/2014,10:33:44,3.24,200
02/07/2014,10:40:28,3.25,300
02/07/2014,10:49:57,3.25,600
02/07/2014,10:53:16,3.25,100
02/07/2014,10:53:32,3.4,1000
02/07/2014,10:54:13,3.4,500
02/07/2014,11:05:37,3.35,1000
02/07/2014,11:11:29,3.25,600
02/07/2014,11:15:26,3.3,60
02/07/2014,11:19:16,3.3,23
02/07/2014,11:21:14,3.25,100
02/07/2014,11:21:22,3.25,100
02/07/2014,11:21:30,3.2,500
02/07/2014,11:21:35,3.2,500
02/07/2014,11:21:43,3.2,500
02/07/2014,11:29:58,3.1,200
02/07/2014,11:35:42,3.19,360
02/07/2014,11:39:51,3.19,1000
02/07/2014,11:52:39,3.15,200
02/07/2014,11:53:51,3.15,100
02/07/2014,11:55:11,3.2,100
02/07/2014,12:17:32,3.2,1500
02/07/2014,12:35:42,3.24,1200
02/07/2014,12:37:53,3.24,100
02/07/2014,12:38:02,3.24,3500
02/07/2014,12:53:57,3.24,400
02/07/2014,13:10:57,3.239,100
02/07/2014,13:11:35,3.24,800
02/07/2014,13:13:41,3.24,1000
02/07/2014,13:39:40,3.24,450
02/07/2014,13:56:04,3.24,500
02/07/2014,14:09:49,3.24,600
02/07/2014,14:11:25,3.24,1000
02/07/2014,14:25:53,3.24,25
02/07/2014,14:30:58,3.24,30
02/07/2014,14:31:36,3.24,30
02/07/2014,14:32:12,3.24,30
02/07/2014,14:33:00,3.24,100
02/07/2014,14:34:49,3.24,1100
02/07/2014,14:36:02,3.24,2000
02/07/2014,14:37:07,3.22,1500
02/07/2014,14:42:30,3.22,3300
02/07/2014,14:42:46,3.22,100
02/07/2014,14:42:54,3.2,1000
02/07/2014,14:53:13,3.23,240
02/07/2014,14:53:27,3.24,500
02/07/2014,14:53:59,3.24,60
02/07/2014,14:54:46,3.2,1500
02/07/2014,14:57:45,3.2,160
02/07/2014,14:57:46,3.2,125
02/07/2014,14:57:54,3.2,100
02/07/2014,15:05:56,3.19,100
02/07/2014,15:22:21,3.19,300
02/07/2014,15:22:28,3.18,150
02/07/2014,15:23:09,3.19,2000
02/07/2014,15:35:23,3.18,1500
02/07/2014,15:44:36,3.18,600
02/10/2014,09:30:02,3.25,100
02/10/2014,09:30:02,3.25,25
02/10/2014,09:30:24,3.25,150
02/10/2014,09:30:40,3.25,100
02/10/2014,09:31:11,3.25,650
02/10/2014,09:35:32,3.24,200
02/10/2014,09:37:59,3.19,100
02/10/2014,09:38:01,3.2,2000
02/10/2014,09:38:09,3.18,185
02/10/2014,09:38:36,3.18,500
02/10/2014,09:39:13,3.18,1042
02/10/2014,09:39:18,3.18,156
02/10/2014,09:39:18,3.17,20
02/10/2014,09:41:24,3.15,100
02/10/2014,09:42:28,3.15,1000
02/10/2014,09:42:28,3.15,1000
02/10/2014,09:42:41,3.15,500
02/10/2014,09:42:57,3.15,100
02/10/2014,09:43:24,3.12,500
02/10/2014,09:43:29,3.12,100
02/10/2014,09:43:32,3.1,5000
02/10/2014,09:44:02,3.1,500
02/10/2014,09:44:19,3.1,500
02/10/2014,09:44:22,3.09,100
02/10/2014,09:44:22,3.09,96
02/10/2014,09:44:55,3.05,100
02/10/2014,09:45:11,3.05,676
02/10/2014,09:45:23,3,150
02/10/2014,09:45:44,2.95,1000
02/10/2014,09:45:53,2.95,1500
02/10/2014,09:47:17,2.95,100
02/10/2014,09:47:46,2.9,100
02/10/2014,09:48:24,2.9,500
02/10/2014,09:48:50,2.9,100
02/10/2014,09:49:11,2.85,386
02/10/2014,09:49:13,2.85,100
02/10/2014,09:49:14,2.8,200
02/10/2014,09:49:15,2.7,100
02/10/2014,09:49:22,2.7,100
02/10/2014,09:49:32,2.7,100
02/10/2014,09:50:09,2.65,2500
02/10/2014,09:50:44,2.66,2500
02/10/2014,09:50:49,2.6,100
02/10/2014,09:50:53,2.7,240
02/10/2014,09:50:54,2.61,1000
02/10/2014,09:50:58,2.65,414
02/10/2014,09:55:24,2.95,100
02/10/2014,09:57:22,2.95,400
02/10/2014,10:07:21,2.95,400
02/10/2014,10:16:28,2.95,250
02/10/2014,10:21:20,2.85,300
02/10/2014,10:32:40,2.94,100
02/10/2014,10:33:18,2.95,426
02/10/2014,10:33:38,2.95,70
02/10/2014,10:33:39,2.94,1900
02/10/2014,10:43:46,2.95,4500
02/10/2014,10:44:00,2.99,200
02/10/2014,10:44:20,2.99,505
02/10/2014,10:49:30,2.96,500
02/10/2014,10:57:22,2.95,2500
02/10/2014,10:57:25,2.95,500
02/10/2014,10:57:40,2.95,500
02/10/2014,11:38:29,3,500
02/10/2014,11:38:35,3.05,500
02/10/2014,11:38:45,3.1,1000
02/10/2014,11:45:08,3.05,100
02/10/2014,11:49:55,3.01,100
02/10/2014,11:50:14,3,1900
02/10/2014,11:50:18,3,100
02/10/2014,12:07:51,3,1000
02/10/2014,12:33:26,3,400
02/10/2014,13:57:20,3.1,150
02/10/2014,13:57:34,3,42
02/10/2014,14:21:42,3.15,500
02/10/2014,14:23:35,3.15,1000
02/10/2014,14:25:40,3.05,200
02/10/2014,14:26:01,3.15,100
02/10/2014,14:50:50,3.15,100
02/10/2014,14:51:00,3.1,100
02/10/2014,14:51:09,3.1,100
02/10/2014,14:51:24,3.05,500
02/10/2014,14:51:43,3,100
02/10/2014,14:52:04,2.95,100
02/10/2014,14:52:15,2.99,25
02/10/2014,14:52:17,2.95,100
02/10/2014,14:52:33,2.9,500
02/10/2014,14:52:47,2.95,600
02/10/2014,14:52:49,2.85,100
02/10/2014,14:52:51,2.85,1000
02/10/2014,14:53:08,2.82,500
02/10/2014,14:53:24,2.85,500
02/10/2014,14:53:43,2.84,5400
02/10/2014,14:53:48,2.85,100
02/10/2014,15:00:48,2.99,64
02/10/2014,15:04:08,2.99,412
02/10/2014,15:11:42,2.99,100
02/10/2014,15:11:46,2.99,100
02/10/2014,15:12:06,2.99,100
02/10/2014,15:20:35,3.04,500
02/10/2014,15:30:28,3,500
02/10/2014,15:36:58,2.95,2000
02/10/2014,15:38:09,3,550
02/10/2014,15:39:48,2.97,2000
02/11/2014,09:30:04,3.2,100
02/11/2014,09:30:18,3.2,2000
02/11/2014,10:03:07,3.18,1000
02/11/2014,10:21:35,3.18,26
02/11/2014,10:27:09,3.15,500
02/11/2014,10:37:22,3.15,1108
02/11/2014,10:37:22,3.15,1054
02/11/2014,10:37:23,3.1,100
02/11/2014,10:42:26,3.05,1000
02/11/2014,10:42:57,3.02,1000
02/11/2014,10:43:29,3.02,1000
02/11/2014,10:48:27,3.02,100
02/11/2014,10:50:36,3.01,1000
02/11/2014,10:51:33,3.01,1000
02/11/2014,10:51:43,3.01,1000
02/11/2014,10:52:17,3.01,1000
02/11/2014,10:53:55,3.01,500
02/11/2014,10:54:31,3.05,40
02/11/2014,10:55:41,3.01,100
02/11/2014,10:55:44,3,3300
02/11/2014,10:55:44,3,100
02/11/2014,10:55:44,3,5000
02/11/2014,10:55:44,3,230
02/11/2014,10:56:21,3,100
02/11/2014,11:01:20,3,100
02/11/2014,11:01:21,3,50
02/11/2014,11:17:30,2.99,600
02/11/2014,11:17:34,3,500
02/11/2014,11:18:49,2.99,3000
02/11/2014,11:25:55,3.03,500
02/11/2014,11:29:59,2.99,400
02/11/2014,11:30:08,2.99,100
02/11/2014,11:30:18,2.99,100
02/11/2014,11:30:46,2.99,200
02/11/2014,11:38:48,2.95,100
02/11/2014,11:44:55,2.98,325
02/11/2014,12:32:09,3,500
02/11/2014,12:32:55,3,50
02/11/2014,13:15:49,3.1,1000
02/11/2014,14:16:16,3.05,350
02/11/2014,14:29:12,2.99,650
02/11/2014,14:32:23,2.99,335
02/11/2014,14:32:29,2.99,500
02/11/2014,15:25:01,3,1000
02/11/2014,15:49:37,3,500
02/11/2014,15:51:08,2.98,300
02/12/2014,08:46:23,3,1500
02/12/2014,09:10:01,3,2000
02/12/2014,09:21:31,3.1,1500
02/12/2014,09:26:33,3.2,2000
02/12/2014,09:27:58,3.2,2500
02/12/2014,09:30:00,3.2,2000
02/12/2014,09:30:00,3.2,10000
02/12/2014,09:30:01,3.2,500
02/12/2014,09:30:02,3.2,30
02/12/2014,09:30:18,3.2,30
02/12/2014,09:40:51,3.05,100
02/12/2014,09:40:52,3.05,1250
02/12/2014,09:41:01,3.05,806
02/12/2014,09:41:11,3,100
02/12/2014,09:43:48,2.98,1000
02/12/2014,09:44:22,3,4000
02/12/2014,09:44:27,2.98,1000
02/12/2014,09:44:31,2.98,2900
02/12/2014,09:47:43,2.98,110
02/12/2014,09:50:49,2.96,100
02/12/2014,09:50:51,2.8,750
02/12/2014,09:51:11,2.95,100
02/12/2014,09:55:35,2.95,1050
02/12/2014,09:55:56,2.95,100
02/12/2014,09:56:29,3,100"

1 个答案:

答案 0 :(得分:2)

这可能不是最优的代码,但至少快了约4倍。我仍然使用for循环,但把部分对data.frame的取子集操作换成了对向量的操作,并改用dplyr进行聚合。

library(dplyr)
library(microbenchmark)

# Time two implementations of the same volume-binning task against each other:
# `original` is the question's row-by-row data.frame loop, `beginneR` keeps the
# loop but works on plain vectors and aggregates with dplyr.
# Both timed expressions include reading ZZ with read.table().
microbenchmark(
  original = {
    data1<-read.table(text=ZZ,sep=',',header=T)
    colnames(data1)<-c("Date","Time","Price","Volume")

    # add an empty column for the volume-bin index
    data1[,"volBinIdx"]<-NA

    # running state: current bin number and volume accumulated in that bin
    volBin<-1
    sumVol<-0

    # assign a bin to every row; each iteration reads and writes a single
    # cell of the data frame
    for(i in 1:nrow(data1))
    {
      sumVol<-sumVol + data1[i,"Volume"]
      if (sumVol<= 5000) {
        data1[i,"volBinIdx"]<-volBin
      } else {
        # running total exceeded 5000: start the next bin with this row
        volBin<-(volBin+1)
        data1[i,"volBinIdx"]<-volBin
        sumVol<-data1[i,'Volume']
      }
    }

    # aggregate data by volBinIdx
    # first()/last() are not defined in this block.
    # NOTE(review): presumably dplyr's, since dplyr is attached before this
    # call - confirm.
    a1<-aggregate(data1$Price,list(bin=data1$volBinIdx),function(x) cbind( first(x),max(x),min(x),last(x)))
    a2<-aggregate(data1$Time,list(bin=data1$volBinIdx),function(x) first(x))
    a3<-aggregate(data1$Date,list(bin=data1$volBinIdx),function(x) first(x))

    # combine the per-bin date, time and price summaries into one data frame
    x3<-cbind(a3[,2,drop=F],a2[,2,drop=F],a1[,2])
    colnames(x3)<-c("Date","Time","Open","High","Low","Close")
  },

  beginneR = {

    data1<-read.table(text=ZZ,sep=',',header=T)
    colnames(data1)<-c("Date","Time","Price","Volume")

    # running state, plus plain vectors used inside the loop instead of
    # indexing the data frame: Volume is a copy of the volume column and
    # volBinIdx is preallocated with one slot per row
    volBin<-1
    sumVol<-0
    Volume <- data1$Volume
    volBinIdx <- numeric(nrow(data1))

    # assign a bin to every row, same rule as in `original`

    for(i in seq_len(nrow(data1))){
      sumVol <- sumVol + Volume[i]
      if (sumVol <= 5000) {
        volBinIdx[i] <- volBin
      } else {
        # chained assignment: increment volBin and store the new bin number
        # at position i in one statement
        volBinIdx[i] <-  volBin <- volBin + 1
        sumVol <- Volume[i]
      }
    }

    # attach the bin index, summarise each bin to one row (first date/time,
    # first/max/min/last price), then drop the bin index from the result
    data1 <- data1 %>%
      mutate(volBinIdx = volBinIdx) %>%
      group_by(volBinIdx) %>%
      summarize(Date = head(Date, 1),
                Time = head(Time, 1),
                Open = head(Price, 1),
                High = max(Price),
                Low = min(Price),
                Close = tail(Price, 1)) %>% 
      select(-volBinIdx)

  }, unit = "relative")

#    Unit: relative
#    expr      min      lq   median       uq       max neval
#original 4.180704 4.24341 4.254675 4.129769 0.7706553   100
#beginneR 1.000000 1.00000 1.000000 1.000000 1.0000000   100