我有一个很大的data.frame
结构:
min.reps <- 1
max.reps <- 3
set.seed(1)
df <- do.call(rbind,lapply(1:100, function(i) {
reps <- seq(1,as.integer(runif(1,min.reps, max.reps)), 1)
vals <- runif(length(reps), 0, 100)
return(data.frame(id=rep(i,length(reps)),rep=reps,val=vals,stringsAsFactors=F))
}))
head(df)
id rep val
1 1 1 37.212390
2 2 1 90.820779
3 2 2 20.168193
4 3 1 94.467527
5 3 2 66.079779
6 4 1 6.178627
每个df$id
都有min.reps
和max.reps
个观察点(df$val
)。实际上,我有~500,000个ID,而不是100个ID。
对于每个df$id
,我想添加一个值,从mean
和sd
作为median
和{{1分别超过其现有值。
这样做很简单:
mad
但我想知道,鉴于我的真实add.reps <- 1
all.ids <- unique(df$id)
require(dplyr)
new.df <- do.call(rbind, lapply(all.ids, function(i) {
id.df <- dplyr::filter(df, id == i)
add.df <- rbind(id.df, data.frame(id = rep(i,add.reps), rep = max(id.df$rep) + add.reps, val = rnorm(add.reps, median(id.df$val), mad(id.df$val)), stringsAsFactors = F))
}))
的维度,是否有更快的方法来实现这一目标。
答案 0 :(得分:2)
这应该快得多:
add.reps <- 1
do.call(rbind, lapply(split(df, df$id), function(x) rbind(x,
data.frame(id = rep(unique(x$id), add.reps), rep = max(x$rep) + add.reps,
val = rnorm(add.reps, median(x$val), mad(x$val)), stringsAsFactors = F))))
答案 1 :(得分:0)
好的,到目前为止:
require(microbenchmark)
microbenchmark(
new.df <- do.call(rbind, lapply(all.ids, function(i) {
id.df <- dplyr::filter(df, id == i)
add.df <- rbind(id.df, data.frame(id = rep(i,add.reps), rep = max(id.df$rep) + add.reps, val = rnorm(add.reps, median(id.df$val), mad(id.df$val)), stringsAsFactors = F))
}))
)
new.df <- do.call(rbind, lapply(all.ids, function(i) { id.df <- dplyr::filter(df, id == i) add.df <- rbind(id.df, data.frame(id = rep(i, add.reps), rep = max(id.df$rep) + add.reps, val = rnorm(add.reps, median(id.df$val), mad(id.df$val)), stringsAsFactors = F)) }))
min lq mean median uq max neval
212.9906 225.1345 371.9314 260.9686 332.5619 1621.586 100
VS
microbenchmark(
new.df <- do.call(rbind, lapply(split(df, df$id), function(x) rbind(x,
data.frame(id = rep(unique(x$id), add.reps), rep = max(x$rep) + add.reps,
val = rnorm(add.reps, median(x$val), mad(x$val)), stringsAsFactors = F))))
)
new.df <- do.call(rbind, lapply(split(df, df$id), function(x) rbind(x, data.frame(id = rep(unique(x$id), add.reps), rep = max(x$rep) + add.reps, val = rnorm(add.reps, median(x$val), mad(x$val)), stringsAsFactors = F))))
min lq mean median uq max neval
133.8357 135.1846 202.9654 137.2722 160.5121 1401.03 100
我想知道这是否还能进一步改善