我有一个大型数据集,正在使用data.table来识别每个分组ID的第一个非缺失值。
我目前可以通过定义一个函数并使用lapply()
将其应用于整个数据帧来实现这一目标。我也尝试使用mclapply()
,但这似乎更慢。
### Libraries ###
library(microbenchmark)
library(ggplot2)
library(data.table)
### Dummy Data Table ###
dt <- data.table(
id = rep(1:4, each = 4),
var_int = c(rep(NA, 3), 1L, rep(NA, 2), 10L, rep(NA, 2), 100L, rep(NA, 2), 1000L, rep(NA, 3)),
var_dou = c(rep(NA, 2), 1, rep(NA, 2), 1.01, rep(NA, 2), 1.001, rep(NA, 3), rep(NA, 3), 1.0001),
var_cha = c(NA, "a", rep(NA, 2), "b", rep(NA, 6), "c", rep(NA, 2), "d", NA),
var_intmi = c(1L, rep(NA, 14), 4L)
)
dt
## id var_int var_dou var_cha var_intmi
## 1: 1 NA NA <NA> 1
## 2: 1 NA NA a NA
## 3: 1 NA 1.0000 <NA> NA
## 4: 1 1 NA <NA> NA
## 5: 2 NA NA b NA
## 6: 2 NA 1.0100 <NA> NA
## 7: 2 10 NA <NA> NA
## 8: 2 NA NA <NA> NA
## 9: 3 NA 1.0010 <NA> NA
## 10: 3 100 NA <NA> NA
## 11: 3 NA NA <NA> NA
## 12: 3 NA NA c NA
## 13: 4 1000 NA <NA> NA
## 14: 4 NA NA <NA> NA
## 15: 4 NA NA d NA
## 16: 4 NA 1.0001 <NA> 4
### Functions ###
firstnonmiss_1 <- function(x){x[which(complete.cases(x))][1]}
firstnonmiss_2 <- function(x){first(x[complete.cases(x)])}
firstnonmiss_3 <- function(x){x[complete.cases(x)][1]}
### Desired Output ###
dt[, lapply(.SD, firstnonmiss_3), by = id]
## id var_int var_dou var_cha var_intmi
## 1: 1 1 1.0000 a 1
## 2: 2 10 1.0100 b NA
## 3: 3 100 1.0010 c NA
## 4: 4 1000 1.0001 d 4
### Benchmarking ###
t <- microbenchmark(
"which()[1]" = dt[, lapply(.SD, firstnonmiss_1), by = id],
"first()" = dt[, lapply(.SD, firstnonmiss_2), by = id],
"[1]" = dt[, lapply(.SD, firstnonmiss_3), by = id],
times = 1e4
)
t
## Unit: microseconds
## expr min lq mean median uq max neval
## which()[1] 414.438 426.8485 516.7795 437.9710 460.8930 161388.83 10000
## first() 401.574 413.6190 483.2857 424.6860 446.6475 41523.61 10000
## [1] 388.845 401.4700 468.9951 411.3505 432.2035 33320.75 10000
### Plot Outputs ###
units <- attributes(print(t))[["unit"]]
autoplot(t) +
labs(x = "Function", y = paste0("Timings, (", units, ")")) +
scale_x_discrete() +
scale_y_log10() +
geom_violin(fill = "skyblue", alpha = 0.5) +
theme_light() +
theme(axis.text.y = element_text(family = "Monaco", angle = 90, hjust = 0.5))
虚拟数据集上的基准时间还不错,但是当我在实际数据集(1,019列,1,506,451行,502,540组ID)上运行该函数时,大约需要11分钟才能完成。是否有更好/更快的方法来获取折叠的数据框,该数据框包含每个列/变量的每个组ID的第一个非缺失观测值?
答案 0 :(得分:2)
您可能要考虑使用Rcpp
来减少NA检查的次数:
library(Rcpp)
cppFunction('
NumericVector firstNonNA(NumericMatrix M) {
NumericVector res(M.ncol());
for (int j=0; j<M.ncol(); j++) {
res[j] = NA_REAL;
for (int i=0; i<M.nrow(); i++) {
if (!Rcpp::traits::is_na<REALSXP>(M(i, j))) {
res[j] = M(i, j);
break;
}
}
}
return res;
}
')
#create sample data
set.seed(0L)
ngrp <- 1000L #502540
avgNr <- 3L
nc <- 1000L #1019
DT <- data.table(
as.data.table(matrix(sample(c(NA,1), ngrp*avgNr*nc, TRUE), nrow=ngrp*avgNr, ncol=nc)),
grp=rep(1:ngrp, each=avgNr))
dim(DT)
#[1] 3000 1001
#use Rcpp function
system.time(DT[, as.list(firstNonNA(as.matrix(.SD))), by=grp])
定时输出:
user system elapsed
5.59 0.08 5.63
不幸的是,没有RAM可以测试实际的暗淡
答案 1 :(得分:2)
在这种情况下,每组只有3个结果时,熔化数据集和转换的速度会更快。
使用@ chinsoon12的数据集,使用OP的原始解决方案可以获得2-3秒的时间,而使用融化和浇铸则可以达到0.4秒。如果您不介意保持数据熔融(即长时间),则大约需要0.2秒,比原始数据快10倍。
#melt and cast
dcast(melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)], grp ~ variable)
#only melt
melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)]
#approach with intermediate variables:
molten_DT<- na.omit(melt(DT, id.vars = 'grp'), 'value')
dcast(molten_DT[molten_DT[, .I[1], by = .(grp, variable)]$V1, ], grp ~ variable)
library(data.table)
library(microbenchmark)
#@chinsoon12's dataset
set.seed(0L)
ngrp <- 1000L #502540
avgNr <- 3L
nc <- 1000L #1019
DT <- data.table(
as.data.table(matrix(sample(c(NA,1), ngrp*avgNr*nc, TRUE), nrow=ngrp*avgNr, ncol=nc)),
grp=rep(1:ngrp, each=avgNr))
system.time(DT[, lapply(.SD, firstnonmiss_1), by = grp])
system.time(DT[, lapply(.SD, firstnonmiss_2), by = grp])
system.time(DT[, lapply(.SD, firstnonmiss_3), by = grp])
microbenchmark(melt_and_cast = {
dcast(melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)], grp ~ variable)
},melt_1 = {
melt(DT, id.vars = 'grp')[!is.na(value), .SD[1], by = .(grp, variable)]
}
,times = 20)
答案 2 :(得分:0)
为了帮助那些将来可能会迷路的人,这就是我使用@Cole的答案为每个分组ID查找每个变量的第一个非缺失值的方法:
## Character Vars ##
vars_char <- names(dt)[sapply(dt, is.character)]
dt_char <- melt(dt,
id.vars = "id",
measure.vars = vars_char,
na.rm = T)
dt_char <- dt_char[, .SD[1], by = .(id, variable)]
dt_char <- dcast(dt_char, id ~ variable)
## Integer Vars ##
vars_int <- names(dt)[sapply(dt, is.integer)]
vars_int <- vars_int[vars_int != "id"]
dt_int <- melt(dt,
id.vars = "id",
measure.vars = vars_int,
na.rm = T)
dt_int <- dt_int[, .SD[1], by = .(id, variable)]
dt_int <- dcast(dt_int, id ~ variable)
## Double Vars ##
vars_doub <- names(dt)[sapply(dt, is.double)]
dt <- melt(dt,
id.vars = "id",
measure.vars = vars_doub,
na.rm = T)
dt <- dt[, .SD[1], by = .(id, variable)]
dt <- dcast(dt, id ~ variable)
## Combine Variables Types ##
dt <- Reduce(function(x, y){merge(x, y, by = "id", all = T)}, list(dt_int, dt, dt_char))
以上内容分为三部分,以避免与所有值都强制转换为字符类型相关的内存问题。如果这不是您的数据集的问题,则可以使用以下方法:
dt <- melt(dt,
id.vars = "id",
na.rm = T)
dt <- dt[, .SD[1], by = .(id, variable)]
dt <- dcast(dt, id ~ variable)
对于初始示例数据集,其运行时间比任何firstnonmiss()
函数都要长得多。
### Benchmarking ###
t <- microbenchmark(
"which()[1]" = dt[, lapply(.SD, firstnonmiss_1), by = id],
"first()" = dt[, lapply(.SD, firstnonmiss_2), by = id],
"[1]" = dt[, lapply(.SD, firstnonmiss_3), by = id],
"reshape" = dcast(melt(dt, id.vars = "id", na.rm = T)[, .SD[1], by = .(id, variable)], id ~ variable),
times = 1e4
)
t
## Unit: microseconds
## expr min lq mean median uq max neval
## which()[1] 416.199 434.8970 497.6187 447.8205 471.3300 19577.46 10000
## first() 400.774 421.4570 472.8580 434.2320 458.2420 31315.78 10000
## [1] 389.710 408.6455 464.6562 421.2085 442.8305 17822.18 10000
## reshape 2052.353 2120.1925 2400.9130 2178.8150 2285.6500 96451.59 10000
units <- attributes(print(t))[["unit"]]
autoplot(t) +
labs(x = "Function", y = paste0("Timings, (", units, ")")) +
scale_x_discrete() +
scale_y_log10() +
geom_violin(fill = "skyblue", alpha = 0.5) +
theme_light() +
theme(axis.text.y = element_text(family = "Monaco", angle = 90, hjust = 0.5))