我有一个大型数据表(10亿行×50列),类似于 if( isEmptyL0(*r0LhPtr) )
{
// list is empty => head and tail point to the same and only element
*r0LhPtr = *r0LtPtr = newPtr;
}
else
{
// append new element at tail, new element is new tail
(*r0LtPtr)->nextPtr = newPtr;
*r0LtPtr = newPtr;
}
中的void RemoveFromQueue(RunZeroLandPtr *r0LhPtr, RunZeroLandPtr *r0LtPtr, int MinsForFuel)
{
// Removes the first node whose MinsForFuel field equals the MinsForFuel
// argument from the singly linked queue and frees it.
//   r0LhPtr     - address of the queue's head pointer; updated when the head is removed
//   r0LtPtr     - address of the queue's tail pointer; updated when the tail is removed
//   MinsForFuel - value to search for
// Does nothing when the queue is empty or no node matches.
// NOTE(review): isEmptyL0 is defined elsewhere; presumably it reports a NULL head - confirm.
RunZeroLandPtr tempPtr;
RunZeroLandPtr currentPtr;
if ( isEmptyL0(*r0LhPtr) )
return;
if (MinsForFuel == (*r0LhPtr)->MinsForFuel)
{
// head element will be removed
tempPtr = *r0LhPtr;
*r0LhPtr = (*r0LhPtr)->nextPtr;
if ( *r0LtPtr == tempPtr ) // there was only one element
*r0LtPtr = *r0LhPtr;
free( tempPtr );
return;
}
// walk the list, stopping on the node just before the first match (or on the last node)
currentPtr = *r0LhPtr;
while ( currentPtr->nextPtr != NULL && MinsForFuel != currentPtr->nextPtr->MinsForFuel )
{
currentPtr = currentPtr->nextPtr;
}
if ( currentPtr->nextPtr != NULL )
{
// the next element is the element you want to remove
tempPtr = currentPtr->nextPtr;
// unlink the match before freeing it
currentPtr->nextPtr = currentPtr->nextPtr->nextPtr ;
if ( *r0LtPtr == tempPtr ) // the removed element is the tail
*r0LtPtr = currentPtr;
free( tempPtr );
}
return;
}
,其中可以组合多个列以形成日期。
我目前用于创建此日期时间列的代码是:
The "Module" DXL Type represents a loaded **ModuleVersion** (current or Baseline).
对于flights数据,大约需要0.6秒。有没有办法提升这一性能?我主要关心运行时间;内存使用是次要问题。
这是一个候选数据表:
library(nycflights13)
答案 0 :(得分:2)
我使用lubridate和stringr,使flights数据上的性能提升了约25%。不幸的是,我目前没有能够处理与完整数据集同等规模数据的计算机,所以只能希望这种方法可以扩展。
library(data.table)
library(nycflights13)
library(fasttime)
library(microbenchmark)
library(lubridate)
library(stringr)
# Convert the nycflights13 `flights` table to a data.table so that `:=`
# below can add the DepDateTime column by reference.
flights <- as.data.table(flights)
# Approach 1: zero-pad month, day and dep_time with formatC(), rewrite the
# padded HHMM as "HH:MM:00" with gsub(), paste everything into one string and
# parse it with fastPOSIXct(). The expression is timed over 50 runs; each run
# re-assigns DepDateTime.
op1 <- microbenchmark(
flights[,DepDateTime := fastPOSIXct(paste0(year,
"-",
formatC(month, width = 2, format = "d", flag = "0"),
"-",
formatC(day, width = 2, format = "d", flag = "0"),
" ",
# replace e.g. 903 with '09:03:00'
gsub("([0-9]{2})([0-9]{2})", "\\1:\\2:00",
formatC(dep_time, width = 4,
format = "d", flag = "0")))
)],
times=50L)
# Approach 2: left-pad dep_time to four characters with str_pad(), join year,
# month, day and the padded time with "-" separators and parse the result
# with ymd_hm(). Also timed over 50 runs.
op2 <- microbenchmark(
flights[,DepDateTime := ymd_hm(paste(year,
month,
day,
str_pad(dep_time,
width = 4,
side = "left",
pad = "0"),
sep = "-"))],
times=50L)
我的电脑上的基准是
>op1
min lq mean median uq max neval
3.385542 3.526347 3.739545 3.679273 3.855418 4.594314 50
>op2
min lq mean median uq max neval
2.536882 2.589711 2.733829 2.715038 2.835111 3.194575 50
答案 1 :(得分:0)
通过在函数(create_fn)中使用连接(join)和sprintf,可以实现相当大的速度提升。对于较小的数据集,提升的幅度较小:
library(data.table)
library(nycflights13)
library(fasttime)
library(microbenchmark)
library(ggplot2) # for autoplot
create_DepDateTime <- function(DT) {
  # Build DepDateTime once per distinct (year, month, day, dep_time)
  # combination and join the result back onto DT, instead of parsing a
  # timestamp string for every row.
  #
  # DT: a data.table with integer-like columns year, month, day, dep_time.
  # Returns the join of DT with the lookup table (DT's columns plus
  # DepDateTime). Note that setkey() keys DT itself by reference.
  setkey(DT, year, month, day, dep_time)
  unique_dates <- unique(DT[, list(year, month, day, dep_time)])

  unique_dates[, DepDateTime := {
    # e.g. 903 -> "0903" -> "09:03:00"
    hhmm <- sprintf("%04d", dep_time)
    clock <- sub("([0-9]{2})([0-9]{2})", "\\1:\\2:00", hhmm, perl = TRUE)
    stamp <- sprintf("%d-%02d-%02d %s", year, month, day, clock)
    fastPOSIXct(stamp, tz = "GMT")
  }]

  # Keyed join: every row of the lookup table pulls in its matching DT rows.
  DT[unique_dates]
}
# Convert the nycflights13 `flights` table to a data.table; the functions
# below read this global object.
flights <- as.data.table(flights)
BENCHMARK <- function() {
  # Baseline approach: assemble a "YYYY-MM-DD HH:MM:00" string for every row
  # of the global `flights` table and parse it with fastPOSIXct(), storing
  # the result in DepDateTime by reference.
  flights[, DepDateTime := {
    mm <- formatC(month, width = 2, format = "d", flag = "0")
    dd <- formatC(day, width = 2, format = "d", flag = "0")
    hhmm <- formatC(dep_time, width = 4, format = "d", flag = "0")
    # turn e.g. "0903" into "09:03:00"
    clock <- gsub("([0-9]{2})([0-9]{2})", "\\1:\\2:00", hhmm)
    fastPOSIXct(paste0(year, "-", mm, "-", dd, " ", clock))
  }]
}
NGaffney_lubridate <- function() {
  # lubridate/stringr approach: left-pad dep_time to four characters, join
  # year, month, day and the padded time with "-" and let ymd_hm() parse the
  # result. DepDateTime is set on the global `flights` table by reference.
  flights[, DepDateTime := {
    padded_time <- stringr::str_pad(dep_time,
                                    width = 4,
                                    side = "left",
                                    pad = "0")
    stamp <- paste(year, month, day, padded_time, sep = "-")
    lubridate::ymd_hm(stamp)
  }]
}
create_fn <- function() {
  # Zero-argument wrapper so the join-based approach can be timed next to the
  # other candidates. The joined table is returned invisibly; the global
  # `flights` binding is not replaced.
  invisible(create_DepDateTime(flights))
}
# Time the three candidate implementations (50 evaluations each) and plot the
# resulting timing distributions.
autoplot(
microbenchmark(
BENCHMARK(),
NGaffney_lubridate(),
create_fn(),
times=50L
)
)