我想把 data.table 中的 NA 值替换为它前面最近的一个非 NA 值,并且只使用 data.table 来实现。我有一个解决方案,但它比 na.locf 慢得多:
library(data.table)
library(zoo)
library(microbenchmark)
# Forward-fill the NA values of column X with zoo::na.locf (last observation
# carried forward).
#
# x: a data.table with a column named X. The column is assigned with `:=`,
#    i.e. by reference, so the caller's table is modified as well.
# Returns x (visibly).
f1 <- function(x) {
  # `FALSE` is spelled out because `F` is an ordinary variable that can be
  # reassigned. With na.rm = FALSE the result keeps the length of X, which
  # the `:=` assignment needs.
  x[, X := na.locf(X, na.rm = FALSE)]
  x
}
# data.table-only forward fill of column X.
#
# x: a data.table with a column named X; it is updated by reference.
# Returns x (visibly).
f2 <- function(x) {
  # TRUE where X holds an observed (non-NA) value.
  is_observed <- !is.na(x[["X"]])
  # cumsum() starts a new group at every observed value, so each group is one
  # observed value followed by the NAs that should inherit it. Leading NAs
  # fall into group 0, whose first element is NA, and therefore stay NA.
  # NOTE(review): the recorded benchmark shows this form to be far slower
  # than na.locf; the per-group `.SD[, X]` subset is the likely cause --
  # confirm by comparing against plain `X[1L]`.
  x[, X := .SD[, X][1L], by = cumsum(is_observed)]
  x
}
# One table per function, built separately, because f1 and f2 both update
# their argument by reference.
m1 <- data.table(X = rep(c(NA, NA, 1, 2, NA, NA, NA, 6, 7, 8), times = 100))
m2 <- data.table(X = rep(c(NA, NA, 1, 2, NA, NA, NA, 6, 7, 8), times = 100))
# NOTE(review): the tables are modified in place, so presumably only the
# first of the 10 runs sees the original NA pattern -- confirm.
microbenchmark(f1(m1), f2(m2), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# f1(m1) 2.648938 2.770792 2.959156 3.894635 6.032533 10
# f2(m2) 994.267610 1916.250440 1926.420436 1941.401077 2008.929024 10
我想知道,为什么它如此缓慢以及是否存在更快的解决方案。
答案 0 :(得分:4)
以下是一个仅使用 data.table 的解决方案,但它比 na.locf 略慢:
# Every non-NA value opens a new group (cumsum of the "is observed" flag), so
# assigning the first element of each group to the whole group carries the
# last observation forward. m1 is updated by reference.
m1[, X := X[1L], by = cumsum(!is.na(X))]
m1
# X
# 1: NA
# 2: NA
# 3: 1
# 4: 2
# 5: 2
# ---
# 996: 2
# 997: 2
# 998: 6
# 999: 7
#1000: 8
速度测试:
# Larger input for timing: the 10-value pattern repeated 1e6 times.
m1 <- data.table(X = rep(c(NA, NA, 1, 2, NA, NA, NA, 6, 7, 8), times = 1e6))

# data.table-only forward fill of column X. Updates x by reference and
# returns the value of the `:=` call.
f3 <- function(x) {
  x[, X := X[1L], by = cumsum(!is.na(X))]
}

# Each function is timed on a fresh copy() so the by-reference update never
# touches m1 itself.
system.time(f1(copy(m1)))
# user system elapsed
# 3.84 0.58 4.62
system.time(f3(copy(m1)))
# user system elapsed
# 5.56 0.19 6.04
下面是一种让它更快的取巧办法,但我认为这会大大降低代码的可读性:
# Faster but less readable forward fill of column X (by reference).
# Returns the value of the final `:=` call.
f4 <- function(x) {
  # Group id that increases by one at every non-NA value of X.
  x[, tmp := cumsum(!is.na(X))]
  # Mark x as keyed by tmp without any checks. tmp is a cumulative sum of
  # TRUE/FALSE flags, so it is already in non-decreasing order.
  setattr(x, "sorted", "tmp")
  # Join the non-NA rows back onto x and copy their X (i.X) into the matching
  # rows. Presumably the match is on the tmp key -- confirm against the
  # data.table join rules.
  x[x[!is.na(X)], X := i.X]
  # Drop the helper column again.
  x[, tmp := NULL]
}
system.time(f4(copy(m1)))
# user system elapsed
# 3.32 0.51 4.00
答案 1 :(得分:4)
正如我在评论(comment)中提到的那样,Rcpp 做这件事非常快。下面我比较了 zoo::na.locf 方法、@eddi 的 f3 和 f4,以及 @RomainFrancois 在此处(here)发布的 Rcpp 方法。
首先,基准测试结果:
# Time the four implementations, 10 runs each.
# NOTE(review): the functions update their argument in place, so presumably
# only the first run of each sees unfilled data -- confirm.
microbenchmark(
  f.zoo(m1),
  eddi.f3(m2),
  eddi.f4(m3),
  f.Rcpp(m4),
  times = 10
)
## Unit: milliseconds
## expr min lq median uq max neval
## f.zoo(m1) 1297.969 1403.67418 1443.5441 1527.7644 1597.9724 10
## eddi.f3(m2) 2982.103 2998.48809 3039.6543 3068.9303 3078.3963 10
## eddi.f4(m3) 1970.650 2017.55740 2061.6599 2074.1497 2099.8892 10
## f.Rcpp(m4) 95.411 98.44505 107.6925 119.2838 171.7855 10
函数定义:
library(data.table)
library(zoo)
library(microbenchmark)
library(Rcpp)
# Build the input once and give every function its own deep copy.
# Plain `<-` does not copy a data.table, so the chained form
# `m1 <- m2 <- m3 <- m4 <- data.table(...)` binds all four names to the same
# object. The by-reference updates made through one name would then be
# visible through the other three, and comparing the results with
# identical() would compare the object with itself.
m1 <- data.table(X = rep(c(NA, NA, 1, 2, NA, NA, NA, 6, 7, 8), 1e6))
m2 <- copy(m1)
m3 <- copy(m1)
m4 <- copy(m1)
# Forward-fill the NA values of column X with zoo::na.locf (last observation
# carried forward).
#
# x: a data.table with a column named X. The column is assigned with `:=`,
#    i.e. by reference, so the caller's table is modified as well.
# Returns x (visibly).
f.zoo <- function(x) {
  # `FALSE` is spelled out because `F` is an ordinary variable that can be
  # reassigned. With na.rm = FALSE the result keeps the length of X, which
  # the `:=` assignment needs.
  x[, X := na.locf(X, na.rm = FALSE)]
  x
}
# data.table-only forward fill of column X: each non-NA value starts a new
# group and its value is assigned to the whole group. Updates x by reference
# and returns the value of the `:=` call.
eddi.f3 <- function(x) {
  x[, X := X[1L], by = cumsum(!is.na(X))]
}
# Faster but less readable forward fill of column X (by reference).
# Returns the value of the final `:=` call.
eddi.f4 <- function(x) {
  # Group id that increases by one at every non-NA value of X.
  x[, tmp := cumsum(!is.na(X))]
  # Mark x as keyed by tmp without any checks. tmp is a cumulative sum of
  # TRUE/FALSE flags, so it is already in non-decreasing order.
  setattr(x, "sorted", "tmp")
  # Join the non-NA rows back onto x and copy their X (i.X) into the matching
  # rows. Presumably the match is on the tmp key -- confirm against the
  # data.table join rules.
  x[x[!is.na(X)], X := i.X]
  # Drop the helper column again.
  x[, tmp := NULL]
}
# Compile naLocfCpp() with Rcpp and make it available in the R session.
#
# naLocfCpp(x) walks over x once and overwrites every run of NAs with the
# value that precedes the run (last observation carried forward). NAs at the
# very start have no preceding value and stay NA. The function writes through
# the pointers returned by x.begin()/x.end() and returns x.
# NOTE(review): whether the caller's vector is changed as well presumably
# depends on x already being a double vector (no conversion) -- confirm.
cppFunction('
NumericVector naLocfCpp(NumericVector x) {
  // Nothing to fill, and begin() must not be dereferenced on an empty vector.
  if (x.size() == 0) return x;
  double *p = x.begin(), *end = x.end();
  double v = *p; p++;
  while (p < end) {
    // Skip the run of observed values; p stops at the next NA or at end.
    while (p < end && !NumericVector::is_na(*p)) p++;
    // Value just before the run of NAs (itself NA for a leading run).
    v = *(p - 1);
    // Overwrite the run of NAs with that value.
    while (p < end && NumericVector::is_na(*p)) {
      *p = v;
      p++;
    }
  }
  return x;
}')
# Forward-fill column X of x with the compiled naLocfCpp() and return x.
#
# x: a data.table (or other list-like object) with a column named X.
f.Rcpp <- function(x) {
  # The return value of naLocfCpp() is discarded, so the fill only reaches x
  # if naLocfCpp() writes into the very vector stored in x.
  # NOTE(review): presumably this holds only when X is a double column; for
  # other column types the vector would be converted first and x would come
  # back unchanged -- confirm.
  naLocfCpp(x$X)
  x
}
所有方法都产生相同的结果:
# Run every implementation once and check that all results agree with the
# zoo-based one.
out1 <- f.zoo(m1)
out2 <- eddi.f3(m2)
out3 <- eddi.f4(m3)
out4 <- f.Rcpp(m4)
all(
  identical(out1, out2),
  identical(out1, out3),
  identical(out1, out4)
)
## TRUE