我有一些类似的数据
dfr <- data.frame(pos=1:20,val=sample(90:120,20))
pos val
1 1 116
2 2 97
3 3 100
4 4 105
5 5 112
6 6 95
7 7 91
8 8 117
9 9 98
10 10 94
11 11 110
12 12 118
13 13 120
14 14 115
15 15 103
16 16 102
17 17 109
18 18 90
19 19 93
20 20 107
我需要计算pos中一个窗口大小的val中位数。我具有以下功能:
#' @param dfr A data.frame with columns pos and val
#' @param win An integer denoting window size
#'
fn_median <- function(dfr,win=5)
{
n <- nrow(dfr)
vec_start <- vector(length=floor(n/win),mode="numeric")
vec_end <- vector(length=floor(n/win),mode="numeric")
vec_median <- vector(length=floor(n/win),mode="numeric")
k <- 1
i <- 1
while(i<=n)
{
vec_start[k] <- dfr$pos[i]
vec_end[k] <- dfr$pos[i+(win-1)]
vec_median[k] <- median(dfr$val[i:(i+(win-1))])
k <- k+1
i <- i+win
}
return(data.frame(start=vec_start,end=vec_end,median=vec_median))
}
返回
> fn_median(dfr,5)
start end median
1 1 5 105
2 6 10 95
3 11 15 115
4 16 20 102
基准化
library(microbenchmark)
library(ggplot2)
autoplot(microbenchmark("loop"=fn_median(dfr,5),times=1000))
此代码太慢。我如何改善它以使其更快?也许使用Apply系列功能?
答案 0 :(得分:4)
您可以使用data.table
并按pos - 1
除以5
(或其他n
)的整数除法进行分组。
library(data.table)
fn_median <- function(df, n){
setDT(df)
df[, .(start = pos[1], end = last(pos), median = median(val))
, by = .(drop = (pos - 1) %/% n)][, -'drop']
}
fn_median(dfr, 5)
# start end median
# 1: 1 5 105
# 2: 6 10 95
# 3: 11 15 115
# 4: 16 20 102
编辑:基准
library(microbenchmark)
dfr <- data.frame(pos = seq_len(1e4), val = sample(1e4))
microbenchmark(fn_median(dfr, 5), fn_median2(dfr, 5), times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# fn_median(dfr, 5) 113.324354 131.217695 147.213517 139.283545 167.387556 188.76767 10
# fn_median2(dfr, 5) 2.896002 3.026053 4.554341 3.448822 3.687797 15.40021 10
dfr <- data.frame(pos = seq_len(1e6), val = sample(1e6))
microbenchmark(fn_median(dfr, 5), fn_median2(dfr, 5), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# fn_median(dfr, 5) 13295.8565 13710.4458 13729.029 13734.9328 13876.7450 14027.1664 5
# fn_median2(dfr, 5) 97.7186 103.9742 120.471 119.3268 121.1799 160.1556 5
使用的功能:
library(data.table)
fn_median2 <- function(df, n){
setDT(df)
df[, .(start = pos[1], end = last(pos), median = median(val))
, by = .(drop = (pos - 1) %/% n)][, -'drop']
}
fn_median <- function(dfr,win=5)
{
n <- nrow(dfr)
vec_start <- vector(length=floor(n/win),mode="numeric")
vec_end <- vector(length=floor(n/win),mode="numeric")
vec_median <- vector(length=floor(n/win),mode="numeric")
k <- 1
i <- 1
while(i<=n)
{
vec_start[k] <- dfr$pos[i]
vec_end[k] <- dfr$pos[i+(win-1)]
vec_median[k] <- median(dfr$val[i:(i+(win-1))])
k <- k+1
i <- i+win
}
return(data.frame(start=vec_start,end=vec_end,median=vec_median))
}
答案 1 :(得分:2)
您可以使用Rcpp
加快循环速度。
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
DataFrame nMedianCpp(DataFrame df, int w) {
NumericVector val = df["val"];
int l = val.size() / w;
int ind = w / 2;
NumericVector res(l);
NumericVector start(l);
NumericVector end(l);
NumericVector temp(l);
for (int i = 0; i < l; i++) {
end[i] = (i + 1) * w;
start[i] = end[i] - w + 1;
temp = val[Range(start[i] - 1, end[i] - 1)];
temp.sort();
if (w % 2 == 0) {
res[i] = (temp[ind - 1] + temp[ind]) / 2;
} else {
res[i] = temp[ind];
}
}
return DataFrame::create(_["start"] = start, _["end"] = end, _["median"] = res);
}
基准:
Unit: microseconds
expr min lq mean median uq max neval
Rcpp 586.711 614.9285 784.7421 656.9605 1067.383 1262.981 100
fn_median 152008.741 153254.4405 158502.5013 154716.9210 158738.811 310708.593 100
fn_median2 2287.717 2365.5755 2544.5946 2393.2325 2423.802 8331.622 100
答案 2 :(得分:1)
data.table
解决方案,使用行组进行汇总
样本数据
dt <- fread("pos val
1 116
2 97
3 100
4 105
5 112
6 95
7 91
8 117
9 98
10 94
11 110
12 118
13 120
14 115
15 103
16 102
17 109
18 90
19 93
20 107")
代码
window <- 5
#create group-incides of window-length
dt[, group := (pos - 1) %/% window]
#and now you can (by these groups) summarise whatever you want
dt[, list(start = pos[1], end = pos[.N], median = median(val) ), by = group][, group:=NULL][]
输出
# start end median
# 1: 1 5 105
# 2: 6 10 95
# 3: 11 15 115
# 4: 16 20 102
如果您确实愿意,我想您可以将window
用作自定义函数。