Question

我有一个大型数据表（10亿行×50列），类似于`nycflights13`包中的`flights`数据集，其中可以组合多个列以形成日期。

我目前用于创建此日期时间列的代码是：

The "Module" DXL Type represents a loaded **ModuleVersion** (current or Baseline).

对于flights数据，大约需要0.6秒。有没有办法改善这种表现？我主要对时间感兴趣;内存使用是次要问题。

这是一个候选数据表：

library(nycflights13)

Answer 1

我使用lubridate和stringr，使flights数据上的性能提升了约25％。不幸的是，我目前没有能够处理完整规模数据集的计算机，所以只能希望这种方法可以扩展。

library(data.table)
library(nycflights13)
library(fasttime)
library(microbenchmark)
library(lubridate)
library(stringr)

# Convert `flights` to a data.table; the benchmarks below rely on
# data.table's `[, col := ...]` syntax.
flights <- as.data.table(flights)

# Baseline timing: build "YYYY-MM-DD HH:MM:00" strings with paste0()/formatC()
# and parse them with fastPOSIXct(). Repeated 50 times.
op1 <- microbenchmark(
  flights[, DepDateTime := {
    # Zero-pad month and day to two digits, dep_time to four digits.
    month_txt <- formatC(month, width = 2, format = "d", flag = "0")
    day_txt <- formatC(day, width = 2, format = "d", flag = "0")
    hhmm_txt <- formatC(dep_time, width = 4, format = "d", flag = "0")
    # e.g. "0903" becomes "09:03:00"
    clock_txt <- gsub("([0-9]{2})([0-9]{2})", "\\1:\\2:00", hhmm_txt)
    fastPOSIXct(paste0(year, "-", month_txt, "-", day_txt, " ", clock_txt))
  }],
  times = 50L
)

# Alternative timing: join year, month, day and the zero-padded dep_time with
# "-" and let lubridate's ymd_hm() parse the result. Repeated 50 times.
op2 <- microbenchmark(
  flights[, DepDateTime := {
    padded_dep <- str_pad(dep_time, width = 4, side = "left", pad = "0")
    stamp_txt <- paste(year, month, day, padded_dep, sep = "-")
    ymd_hm(stamp_txt)
  }],
  times = 50L
)

我的电脑上的基准是

 >op1
      min       lq     mean   median       uq      max neval
 3.385542 3.526347 3.739545 3.679273 3.855418 4.594314    50
>op2
      min       lq     mean   median       uq      max neval
 2.536882 2.589711 2.733829 2.715038 2.835111 3.194575    50

Answer 2

通过在函数（create_fn）中使用连接（join）和sprintf，可以实现相当大的速度提升。对于较小的数据集，提升的幅度较小：

library(data.table)
library(nycflights13)
library(fasttime)
library(microbenchmark)
library(ggplot2) # for autoplot

# Add a DepDateTime column by parsing each distinct
# (year, month, day, dep_time) combination once and joining the parsed
# values back onto the full table.
#
# DT: a data.table with columns year, month, day and dep_time.
# Returns the result of the keyed join DT[unique_dates].
# Note: setkeyv() is called on DT itself, so the caller's table is keyed
# on these four columns as a side effect.
create_DepDateTime <- function(DT) {
  setkeyv(DT, c("year", "month", "day", "dep_time"))
  unique_dates <- unique(DT[, .(year, month, day, dep_time)])
  unique_dates[, DepDateTime := {
    # Zero-pad dep_time to four digits, then turn "HHMM" into "HH:MM:00".
    hhmm_txt <- sprintf("%04d", dep_time)
    clock_txt <- sub("([0-9]{2})([0-9]{2})", "\\1:\\2:00", hhmm_txt,
                     perl = TRUE)
    stamp_txt <- sprintf("%d-%02d-%02d %s", year, month, day, clock_txt)
    fastPOSIXct(stamp_txt, tz = "GMT")
  }]
  DT[unique_dates]
}

# Convert `flights` to a data.table; the functions below rely on
# data.table's `[, col := ...]` syntax and keyed joins.
flights <- as.data.table(flights)

# Baseline approach: build "YYYY-MM-DD HH:MM:00" strings with
# paste0()/formatC() and parse them with fastPOSIXct(), assigning the result
# to flights$DepDateTime by reference.
BENCHMARK <- function() {
  flights[, DepDateTime := {
    # Zero-pad month and day to two digits, dep_time to four digits.
    month_txt <- formatC(month, width = 2, format = "d", flag = "0")
    day_txt <- formatC(day, width = 2, format = "d", flag = "0")
    hhmm_txt <- formatC(dep_time, width = 4, format = "d", flag = "0")
    # e.g. "0903" becomes "09:03:00"
    clock_txt <- gsub("([0-9]{2})([0-9]{2})", "\\1:\\2:00", hhmm_txt)
    fastPOSIXct(paste0(year, "-", month_txt, "-", day_txt, " ", clock_txt))
  }]
}

# lubridate/stringr approach: join year, month, day and the zero-padded
# dep_time with "-" and parse with lubridate::ymd_hm(), assigning the result
# to flights$DepDateTime by reference.
NGaffney_lubridate <- function() {
  flights[, DepDateTime := {
    padded_dep <- stringr::str_pad(dep_time, width = 4, side = "left",
                                   pad = "0")
    stamp_txt <- paste(year, month, day, padded_dep, sep = "-")
    lubridate::ymd_hm(stamp_txt)
  }]
}
# Benchmark wrapper around create_DepDateTime() applied to the global
# `flights` table. The joined table is returned invisibly and the global
# binding is not reassigned; create_DepDateTime() does call setkey() on the
# table it receives.
create_fn <- function() {
  invisible(create_DepDateTime(flights))
}

# Time the three approaches (50 runs each) and plot the timing distributions.
autoplot(
  object = microbenchmark(
    BENCHMARK(),
    NGaffney_lubridate(),
    create_fn(),
    times = 50L
  )
)

更高效的快速使用

2 个答案: