如何加快R循环

时间:2019-02-15 13:09:26

标签: r

我有一些类似的数据

dfr <- data.frame(pos=1:20,val=sample(90:120,20))

   pos val
1    1 116
2    2  97
3    3 100
4    4 105
5    5 112
6    6  95
7    7  91
8    8 117
9    9  98
10  10  94
11  11 110
12  12 118
13  13 120
14  14 115
15  15 103
16  16 102
17  17 109
18  18  90
19  19  93
20  20 107

我需要计算pos中一个窗口大小的val中位数。我具有以下功能:

#' @param dfr A data.frame with columns pos and val
#' @param win An integer denoting window size
#'
fn_median <- function(dfr,win=5)
{
  n <- nrow(dfr)
  vec_start <- vector(length=floor(n/win),mode="numeric")
  vec_end <- vector(length=floor(n/win),mode="numeric")
  vec_median <- vector(length=floor(n/win),mode="numeric")
  k <- 1
  i <- 1
  while(i<=n)
  {
    vec_start[k] <- dfr$pos[i]
    vec_end[k] <- dfr$pos[i+(win-1)]
    vec_median[k] <- median(dfr$val[i:(i+(win-1))])
    k <- k+1
    i <- i+win
  }

  return(data.frame(start=vec_start,end=vec_end,median=vec_median))
}

返回

> fn_median(dfr,5)
  start end median
1     1   5    105
2     6  10     95
3    11  15    115
4    16  20    102

基准化

library(microbenchmark)
library(ggplot2)

autoplot(microbenchmark("loop"=fn_median(dfr,5),times=1000))

enter image description here

此代码太慢。我如何改善它以使其更快?也许使用Apply系列功能?

3 个答案:

答案 0 :(得分:4)

您可以使用data.table并按pos - 1除以5(或其他n)的整数除法进行分组。

library(data.table)
fn_median <- function(df, n){
  setDT(df)
  df[, .(start = pos[1], end = last(pos), median = median(val))
      , by = .(drop = (pos - 1) %/% n)][, -'drop']
}

fn_median(dfr, 5)

#    start end median
# 1:     1   5    105
# 2:     6  10     95
# 3:    11  15    115
# 4:    16  20    102

编辑:基准

library(microbenchmark)
dfr <- data.frame(pos = seq_len(1e4), val = sample(1e4))
microbenchmark(fn_median(dfr, 5), fn_median2(dfr, 5), times = 10)
# Unit: milliseconds
#                expr        min         lq       mean     median         uq       max neval
#   fn_median(dfr, 5) 113.324354 131.217695 147.213517 139.283545 167.387556 188.76767    10
#  fn_median2(dfr, 5)   2.896002   3.026053   4.554341   3.448822   3.687797  15.40021    10

dfr <- data.frame(pos = seq_len(1e6), val = sample(1e6))
microbenchmark(fn_median(dfr, 5), fn_median2(dfr, 5), times = 5)
# Unit: milliseconds
#                expr        min         lq      mean     median         uq        max neval
#   fn_median(dfr, 5) 13295.8565 13710.4458 13729.029 13734.9328 13876.7450 14027.1664     5
#  fn_median2(dfr, 5)    97.7186   103.9742   120.471   119.3268   121.1799   160.1556     5

使用的功能:

library(data.table)
fn_median2 <- function(df, n){
  setDT(df)
  df[, .(start = pos[1], end = last(pos), median = median(val))
      , by = .(drop = (pos - 1) %/% n)][, -'drop']
}



fn_median <- function(dfr,win=5)
{
  n <- nrow(dfr)
  vec_start <- vector(length=floor(n/win),mode="numeric")
  vec_end <- vector(length=floor(n/win),mode="numeric")
  vec_median <- vector(length=floor(n/win),mode="numeric")
  k <- 1
  i <- 1
  while(i<=n)
  {
    vec_start[k] <- dfr$pos[i]
    vec_end[k] <- dfr$pos[i+(win-1)]
    vec_median[k] <- median(dfr$val[i:(i+(win-1))])
    k <- k+1
    i <- i+win
  }

  return(data.frame(start=vec_start,end=vec_end,median=vec_median))
}

答案 1 :(得分:2)

您可以使用Rcpp加快循环速度。

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
DataFrame nMedianCpp(DataFrame df, int w) {

  NumericVector val = df["val"];
  int l = val.size() / w;
  int ind = w / 2;
  NumericVector res(l);
  NumericVector start(l);
  NumericVector end(l);
  NumericVector temp(l);

  for (int i = 0; i < l; i++) {

    end[i] = (i + 1) * w;
    start[i] = end[i] - w + 1;
    temp = val[Range(start[i] - 1, end[i] - 1)];
    temp.sort();
    if (w % 2 == 0) {
      res[i] = (temp[ind - 1] + temp[ind]) / 2;
    } else {
      res[i] = temp[ind];
    }
  }
  return DataFrame::create(_["start"] = start, _["end"] = end, _["median"] = res);
}

基准:

Unit: microseconds
       expr        min          lq        mean      median         uq        max neval
       Rcpp    586.711    614.9285    784.7421    656.9605   1067.383   1262.981   100
  fn_median 152008.741 153254.4405 158502.5013 154716.9210 158738.811 310708.593   100
 fn_median2   2287.717   2365.5755   2544.5946   2393.2325   2423.802   8331.622   100

enter image description here

答案 2 :(得分:1)

data.table解决方案,使用行组进行汇总

样本数据

dt <- fread("pos val
1 116
2  97
3 100
4 105
5 112
6  95
7  91
8 117
9  98
10  94
11 110
12 118
13 120
14 115
15 103
16 102
17 109
18  90
19  93
20 107")

代码

window <- 5
#create group-incides of window-length
dt[, group := (pos - 1) %/% window]
#and now you can (by these groups) summarise whatever you want
dt[, list(start = pos[1], end = pos[.N], median = median(val) ), by = group][, group:=NULL][]

输出

#    start end median
# 1:     1   5    105
# 2:     6  10     95
# 3:    11  15    115
# 4:    16  20    102

如果您确实愿意,我想您可以将window用作自定义函数。