Question

我有以下数据框（称为mydata_tsample）：

cusip_id     trd_exctn_dt   trd_exctn_tm    price   contra_party_type **refPrice**
BUHADU       01.04.2016     01:10:50        101.00  C                 102.10
BUHADU       01.04.2016     02:10:50        101.50  C                 102.10    
BUHADU       01.04.2016     08:10:50        102.10  D                 102.10
BUHADU       01.04.2016     09:10:50        102.10  C                 102.10
BUHADU       02.04.2016     07:12:50        90.50   C                 90.85
BUHADU       02.04.2016     09:10:55        90.85   D                 90.85
BUHADU       02.04.2016     12:11:40        90.90   C                 91.00
BUHADU       02.04.2016     12:12:02        91.00   D                 91.00
XDSEOI       03.04.2016     06:52:51        50.00   D                 50.00 
XDSEOI       03.04.2016     08:40:58        50.20   C                 50.00  
XDSEOI       03.04.2016     15:10:51        51.00   C                 52.00
XDSEOI       03.04.2016     15:14:51        52.00   D                 52.00

我想生成/添加一个使用for循环计算的新列（称为refPrice）。对于RefPrice列中的每一行，我想在以下条件下提取价格：

相同cusip_ID
相同trd_exctn_dt
contra_party_type = D
然后取trd_exctn_tm最接近的那一行的价格

我做了一个完全符合这个要求的代码：

# Row-by-row lookup of the reference price: for every trade, take the price
# of the dealer ("D") trade with the same cusip_id and trd_exctn_dt whose
# trd_exctn_tm is closest. This is intentionally still a plain loop; it
# re-filters the whole data frame on every iteration, which is why it is slow.
# Assumes trd_exctn_tm is stored in a type that supports subtraction
# (e.g. numeric seconds) - TODO confirm against the real data.

# Create the result column up front. Assigning into a missing column with
# `df$col[i] <- x` builds a short vector that `$<-` either recycles over all
# rows or rejects, so rows without any dealer trade would not stay NA.
mydata_tsample$RefPrice <- NA_real_

for (i in seq_len(nrow(mydata_tsample))) {
  # Narrow down step by step: same bond, then same day, then dealer trades.
  same_cusip <- mydata_tsample[
    mydata_tsample$cusip_id %in% mydata_tsample$cusip_id[i],
  ]
  same_day <- same_cusip[
    same_cusip$trd_exctn_dt %in% mydata_tsample$trd_exctn_dt[i],
  ]
  dealer_trades <- same_day[same_day$contra_party_type %in% "D", ]

  # No dealer trade for this bond/day: leave RefPrice as NA.
  if (nrow(dealer_trades) == 0) {
    next
  }

  # Index of the dealer trade closest in time (first one wins on ties).
  closest <- which.min(
    abs(dealer_trades$trd_exctn_tm - mydata_tsample$trd_exctn_tm[i])
  )
  mydata_tsample$RefPrice[i] <- dealer_trades$price[closest]
}

我遇到的问题是速度：处理50万行就花了几个小时，而我总共有500万行……

我尝试了doParallel，但我没有成功。

library(doParallel)
registerDoParallel(cores=4)
library(foreach)
# Parallel variant of the row-by-row lookup, as attempted by the asker and
# reported as not working. The body repeats the same three-step filter
# (same cusip_id, same trd_exctn_dt, contra_party_type "D") for row i.
foreach(i=1:nrow(mydata_tsample)) %dopar% {
  Mtx_aftr_CUSIP=mydata_tsample[mydata_tsample$cusip_id %in% mydata_tsample[i,1],]
  Mtx_aftr_CUSIP_dt=Mtx_aftr_CUSIP[Mtx_aftr_CUSIP$trd_exctn_dt %in% mydata_tsample[i,2],]
  Mtx_aftr_CUSIP_dt_dealer=Mtx_aftr_CUSIP_dt[Mtx_aftr_CUSIP_dt$contra_party_type %in% "D",]
  # NOTE(review): `next` is only meaningful inside for/while/repeat. The
  # foreach body is an ordinary expression, so this branch presumably fails
  # with a "no loop for break/next" error - confirm.
  if(nrow(Mtx_aftr_CUSIP_dt_dealer)==0) {next} else 
  {
    closesttime=which.min(abs(Mtx_aftr_CUSIP_dt_dealer$trd_exctn_tm - mydata_tsample[i,3]))
    # NOTE(review): with %dopar% the body runs on worker processes, so this
    # assignment presumably changes a worker-side copy only. The value
    # returned by foreach() is not stored anywhere here, so the caller's
    # mydata_tsample would stay unchanged - confirm. The usual pattern is to
    # return the price from the body and collect it with `.combine`.
    mydata_tsample$RefPrice[i]=Mtx_aftr_CUSIP_dt_dealer[closesttime,4]
  }
}

# NOTE(review): unmatched closing brace - nothing in this snippet opens it.
}

Answer 1

这是一个非常快的解决方案，它使用data.table的滚动连接，在500000行上只需几毫秒：

数据：

# fread() comes from data.table, so load it before the first use.
library(data.table)

# Sample data, one trade per line. The embedded newlines matter: they are
# what makes fread() treat the string as inline text rather than as a file
# name or shell command.
dt <- fread("cusip_id trd_exctn_dt trd_exctn_tm price contra_party_type
BUHADU 01.04.2016 01:10:50 101.00 C
BUHADU 01.04.2016 02:10:50 101.50 C
BUHADU 01.04.2016 08:10:50 102.10 D
BUHADU 01.04.2016 09:10:50 102.10 C
BUHADU 02.04.2016 07:12:50 90.50 C
BUHADU 02.04.2016 09:10:55 90.85 D
BUHADU 02.04.2016 12:11:40 90.90 C
BUHADU 02.04.2016 12:12:02 91.00 D
XDSEOI 03.04.2016 06:52:51 50.00 D
XDSEOI 03.04.2016 08:40:58 50.20 C
XDSEOI 03.04.2016 15:10:51 51.00 C
XDSEOI 03.04.2016 15:14:51 52.00 D
XDSEOI 03.04.2016 23:59:00 58.00 D
XDSEOI 04.04.2016 01:00:00 52.00 C
XDSEOI 04.04.2016 15:14:51 55.00 D")

代码：

library(data.table)
library(lubridate)

# Convert trd_exctn_tm from "HH:MM:SS" to the number of seconds since
# midnight (in place), so the rolling join can measure distances on it.
dt[, trd_exctn_tm := as.numeric(hms(x = trd_exctn_tm))]

# Set keys: the join uses these columns, and the roll is applied to the last
# one (trd_exctn_tm).
setkey(dt, cusip_id, trd_exctn_dt, trd_exctn_tm)

# keep rollin rollin rollin...
# Left table: dealer ("D") trades only, with price exposed as RefPrice.
# Joining the full table onto it matches cusip_id and trd_exctn_dt exactly
# and, via roll = "nearest", picks the dealer trade closest in time.
dt[
  contra_party_type == "D",
  .(cusip_id, trd_exctn_dt, trd_exctn_tm, RefPrice = price)
][dt, roll = "nearest"]

输出：

    cusip_id trd_exctn_dt trd_exctn_tm RefPrice  price contra_party_type
 1:   BUHADU   01.04.2016         4250   102.10 101.00                 C
 2:   BUHADU   01.04.2016         7850   102.10 101.50                 C
 3:   BUHADU   01.04.2016        29450   102.10 102.10                 D
 4:   BUHADU   01.04.2016        33050   102.10 102.10                 C
 5:   BUHADU   02.04.2016        25970    90.85  90.50                 C
 6:   BUHADU   02.04.2016        33055    90.85  90.85                 D
 7:   BUHADU   02.04.2016        43900    91.00  90.90                 C
 8:   BUHADU   02.04.2016        43922    91.00  91.00                 D
 9:   XDSEOI   03.04.2016        24771    50.00  50.00                 D
10:   XDSEOI   03.04.2016        31258    50.00  50.20                 C
11:   XDSEOI   03.04.2016        54651    52.00  51.00                 C
12:   XDSEOI   03.04.2016        54891    52.00  52.00                 D
13:   XDSEOI   03.04.2016        86340    58.00  58.00                 D
14:   XDSEOI   04.04.2016         3600    55.00  52.00                 C
15:   XDSEOI   04.04.2016        54891    55.00  55.00                 D

说明：

data.table操作的第一部分

# Dealer ("D") rows only; keep the three key columns and expose price as RefPrice.
dt[
  contra_party_type == "D",
  .(cusip_id, trd_exctn_dt, trd_exctn_tm, RefPrice = price)
]

可以翻译为

从dt中筛选出contra_party_type == "D"的行，选取列cusip_id、trd_exctn_dt、trd_exctn_tm，并新增一列RefPrice（其值等于price）。

所以这个data.table看起来像

   cusip_id trd_exctn_dt trd_exctn_tm RefPrice
1:   BUHADU   01.04.2016     08:10:50   102.10
2:   BUHADU   02.04.2016     09:10:55    90.85
3:   BUHADU   02.04.2016     12:12:02    91.00
4:   XDSEOI   03.04.2016     06:52:51    50.00
5:   XDSEOI   03.04.2016     15:14:51    52.00
6:   XDSEOI   03.04.2016     23:59:00    58.00
7:   XDSEOI   04.04.2016     15:14:51    55.00

将其保存为dt2并使用setkey(dt, cusip_id, trd_exctn_dt, trd_exctn_tm)设置相同的键，我们可以转到命令的第二部分：

# Join dt onto dt2 by key, rolling to the nearest trd_exctn_tm.
dt2[dt, roll = "nearest"]

为了理解将其改为

# Same keyed join, without rolling.
dt2[dt]

并查看结果。可以看到，我们按键（key）列把两个表连接了起来，RefPrice已被添加到dt的各行上。但RefPrice中有一些NA，因为这些行在dt2中找不到匹配。为了去掉这些NA，我们使用roll = "nearest"，意思是：按trd_exctn_tm在dt2中找到最接近的那一行，并用它的RefPrice填充这些行。

Answer 2

这是一个简单的部分解决方案，只需几秒钟即可运行完毕，它取的是此前最近一笔contra_party_type == "D"交易的价格。

# Build a synthetic trade table (random bonds, days, times, prices and
# counterparty types) for timing the approach.
library(dplyr)
library(zoo)

n <- 500000

dfr <- dplyr::tibble(
  cusip_id = sample(x = LETTERS, size = n, replace = TRUE),
  # A random day offset (1..365) from the origin date.
  trd_exctn_dt = as.Date(
    sample(x = 365, size = n, replace = TRUE),
    origin = "2016-01-01"
  ),
  # A random second of the day, rendered as an "HH:MM:SS" string.
  trd_exctn_tm = strftime(
    as.POSIXlt(
      sample(x = 60 * 60 * 24, size = n, replace = TRUE),
      origin = "1970-01-01"
    ),
    format = "%H:%M:%S"
  ),
  price = round(rnorm(n = n, mean = 100, sd = 5), digits = 2),
  contra_party_type = sample(x = LETTERS[1:4], size = n, replace = TRUE)
)


# Within each bond/day, carry the most recent dealer ("D") price forward.
# Sorting by the group keys and then by time gives the same row order as a
# grouped arrange() with .by_group = TRUE.
dfr <- dfr %>%
  arrange(cusip_id, trd_exctn_dt, trd_exctn_tm) %>%
  group_by(cusip_id, trd_exctn_dt) %>%
  mutate(
    # Non-dealer rows start as NA and are filled from the previous dealer
    # row. Leading NAs are kept (na.rm = FALSE) so the column length matches.
    refprice = zoo::na.locf(
      ifelse(contra_party_type == "D", price, NA),
      na.rm = FALSE
    )
  )
dfr

# A tibble: 500,000 x 6
# Groups:   cusip_id, trd_exctn_dt [9,490]
   cusip_id trd_exctn_dt trd_exctn_tm price contra_party_type refprice
   <chr>    <date>       <chr>        <dbl> <chr>                <dbl>
 1 A        2016-01-02   00:25:47      89.6 D                     89.6
 2 A        2016-01-02   01:19:37     101.  B                     89.6
 3 A        2016-01-02   01:22:34     108.  B                     89.6
 4 A        2016-01-02   01:28:14     102.  D                    102. 
 5 A        2016-01-02   01:35:36      95.9 A                    102. 
 6 A        2016-01-02   01:45:01     102.  C                    102.

要得到你真正想要的结果（时间上最接近的那一笔，而不仅仅是之前的那一笔），我会：

计算每一行到上一个cpt（contra_party_type）为D的实例的时间差
计算每一行到下一个cpt为D的实例的时间差
分别取这两个实例的价格
使用ifelse选取时间差较小的那个实例的价格

根据另一列按组创建最接近值的新列

2 个答案: