在巨大的数据框中计算pvalue需要很长时间

时间:2018-08-22 08:26:17

标签: r performance dplyr

我正在尝试使用学生t检验(Student's t-test)在长格式的巨大数据框中计算p值。由于我的原始数据框行数极多,p值的计算要花很长时间(大约100分钟)。

我正在尝试加快处理速度,但是我不确定数据框是否是提高速度的最佳格式,或者我是否应该重塑数据并可能使用matrix

这是一个可重现的示例,带有少量数据框和最后一个基准。

library(dplyr)

# Minimal Welch two-sample t-test that returns only the two-sided p-value,
# skipping all of the input validation and extras of stats::t.test().
#
# x, y: numeric vectors of observations for the two groups.
# Returns: the two-sided p-value (length-1 numeric).
my.t.test <- function (x, y = NULL) {
  n1 <- length(x)
  n2 <- length(y)
  se2_1 <- var(x) / n1   # squared standard error of mean(x)
  se2_2 <- var(y) / n2   # squared standard error of mean(y)
  pooled <- se2_1 + se2_2
  # Welch-Satterthwaite approximation to the degrees of freedom
  dof <- pooled^2 / (se2_1^2 / (n1 - 1) + se2_2^2 / (n2 - 1))
  t_stat <- (mean(x) - mean(y)) / sqrt(pooled)
  2 * pt(-abs(t_stat), dof)
}

# Contrast labels and a reproducible toy data set:
# 1000 ids x 8 rows each, with measurement columns A and B.
# The rnorm() calls are issued in the same order as before so that
# set.seed(1) reproduces exactly the same values.
cont <- c("A", "B")
set.seed(1)
col_a <- rnorm(8000, mean = 26, sd = 5)
col_b <- rnorm(8000, mean = 25, sd = 7)
df1 <- data.frame(
  id = rep(1:1000, each = 8),
  replicate = 1:4,   # recycled over the 8 rows of each id
  A = col_a,
  B = col_b
)

# Reference implementation: per-id Welch t-test via stats::t.test().
# Reads df1 and cont from the enclosing environment.
# Fix: use the documented tidy-eval pronoun .data[[...]] instead of get();
# get() can silently resolve a name in the calling environment rather than
# in the data mask if the column is missing or masked.
completeDF <- function() {
  df1 %>%
  group_by(id) %>%
  summarise(Comparison=paste(cont, collapse=' - '),
            p.value=t.test(.data[[cont[1]]], .data[[cont[2]]])$p.value,
            log10.p.value=-log10(p.value),
            log2.foldchange=mean(.data[[cont[1]]], na.rm=TRUE) - mean(.data[[cont[2]]], na.rm=TRUE)
  )}
# Baseline without the p-value: only the comparison label and log2 fold
# change, to measure how much time t.test() itself costs.
# Fix: .data[[...]] instead of get() -- get() can silently resolve a name
# outside the data mask.
noPvalue <- function() {
  df1 %>%
    group_by(id) %>%
    summarise(Comparison=paste(cont, collapse=' - '),
              log2.foldchange=mean(.data[[cont[1]]], na.rm=TRUE) - mean(.data[[cont[2]]], na.rm=TRUE)
    )}
# Same output as completeDF() but with the stripped-down my.t.test()
# instead of stats::t.test().
# Fix: .data[[...]] instead of get() -- get() can silently resolve a name
# outside the data mask.
myPvalue <- function() {
  df1 %>%
    group_by(id) %>%
    summarise(Comparison=paste(cont, collapse=' - '),
              p.value=my.t.test(.data[[cont[1]]], .data[[cont[2]]]),
              log10.p.value=-log10(p.value),
              log2.foldchange=mean(.data[[cont[1]]], na.rm=TRUE) - mean(.data[[cont[2]]], na.rm=TRUE)
    )}
# Benchmark the three variants; the transcript below shows that the
# t.test() call dominates completeDF()'s runtime.
microbenchmark::microbenchmark(
  completeDF(), noPvalue(), myPvalue()
)

我的基准:

Unit: milliseconds
         expr       min        lq      mean    median        uq      max neval
 completeDF() 358.38330 365.09423 424.60255 369.20453 377.40354 655.2009   100
   noPvalue()  57.42996  58.89978  81.86222  59.66851  60.96582 337.2346   100
   myPvalue() 216.04812 220.98277 318.09568 224.19516 493.74908 609.4516   100

因此,借助精简版的t.test函数(省去了参数检查等步骤),我已经节省了一些时间。但我想知道能否通过某种向量化方式进一步提升速度。

1 个答案:

答案 0 :(得分:6)

均值和方差计算需要按组进行,但t检验和p值计算可以向量化。

# Vectorised Welch t-test: per-group moments are computed once per vector,
# then the statistic and p-value are evaluated for all groups in single
# vectorised expressions.
#
# grp: grouping vector (coerced to factor); x, y: measurement vectors.
# Returns: named numeric vector of two-sided p-values, one per group level.
my.t.test.2 <- function(grp, x, y) {
    grp <- factor(grp)

    # Per-group size / mean / variance for one measurement vector.
    moments <- function(v) {
        parts <- split(v, grp)
        list(n    = lengths(parts),
             mean = vapply(parts, mean, numeric(1)),
             var  = vapply(parts, var, numeric(1)))
    }
    mx <- moments(x)
    my <- moments(y)

    se2_x <- mx$var / mx$n
    se2_y <- my$var / my$n
    se <- sqrt(se2_x + se2_y)
    tstat <- (mx$mean - my$mean) / se
    # Welch-Satterthwaite degrees of freedom, per group
    df <- se^4 / (se2_x^2 / (mx$n - 1L) + se2_y^2 / (my$n - 1L))

    2 * pt(-abs(tstat), df)
}

通过避免S3方法调度(它会拖慢mean()等函数)并尽量减少冗余计算(例如每组的长度只计算一次),可以尝试写得更巧妙一些。

# Tuned variant of my.t.test.2, byte-compiled: calls split.default() and
# mean.default() directly to skip S3 dispatch, and computes the group sizes
# only once (x and y are split by the same grp, so their group sizes are
# identical by construction).
my.t.test.2.1 <- compiler::cmpfun(function(grp, x, y) {
    grp <- factor(grp)

    x_g <- split.default(x, grp)
    n <- lengths(x_g)    # shared per-group sizes (same grouping for x and y)
    n1 <- n - 1L
    x_mean <- vapply(x_g, mean.default, numeric(1), USE.NAMES = FALSE)
    x_var <- vapply(x_g, var, numeric(1), USE.NAMES = FALSE)

    y_g <- split.default(y, grp)
    y_mean <- vapply(y_g, mean.default, numeric(1), USE.NAMES = FALSE)
    y_var <- vapply(y_g, var, numeric(1), USE.NAMES = FALSE)

    x_se2 <- x_var / n   # squared standard errors of the group means
    y_se2 <- y_var / n
    se <- sqrt(x_se2 + y_se2)
    tstat <- (x_mean - y_mean) / se
    # Welch-Satterthwaite df, simplified under equal group sizes n
    df <- se^4 / ((x_se2^2 + y_se2^2) / n1)

    2 * pt(-abs(tstat), df)
})

可以将标准解决方案和其他解决方案包装起来以提供相同的输出

# Wrappers so that every approach yields the same (id, p.value) output shape.
# NOTE(review): f2/f2.1 pair unique(df$id) with factor(grp)-ordered p-values;
# this matches only when ids appear in sorted order, as in df1 -- confirm
# before reusing with unsorted data.
f0 <- function(df) {
    df %>% group_by(id) %>% summarize(p.value = t.test(A, B)$p.value)
}

f1 <- function(df) {
    df %>% group_by(id) %>% summarize(p.value = my.t.test(A, B))
}

f2 <- function(df) {
    tibble(id = unique(df$id), p.value = my.t.test.2(df$id, df$A, df$B))
}

f2.1 <- function(df) {
    tibble(id = unique(df$id), p.value = my.t.test.2.1(df$id, df$A, df$B))
}

f2.1()产生与规范实现相同的结果,并且速度大约快一倍以上。不过,针对mean()等函数调度开销的担忧(对比f2()与f2.1()的耗时)似乎有些多余。

> all.equal.default(f0(df1), f2.1(df1))
[1] TRUE
> microbenchmark(f0(df1), f1(df1), f2(df1), f2.1(df1), times = 5)
Unit: milliseconds
      expr      min       lq     mean   median       uq      max neval
   f0(df1) 374.2819 379.7749 380.8365 380.0094 381.2368 388.8794     5
   f1(df1) 249.6502 250.2525 251.8813 252.1965 253.3444 253.9630     5
   f2(df1) 154.1152 158.3243 159.8277 159.1076 162.7602 164.8311     5
 f2.1(df1) 151.0032 151.0149 152.3900 152.8105 153.2840 153.8373     5

下面是一个借助C++辅助函数(sum_cpp、var_cpp)的实现:

# Same minimal Welch t-test as my.t.test, but with the sum and the variance
# computed by C++ helpers.
# NOTE(review): sum_cpp() and var_cpp() are Rcpp helpers not shown in this
# file -- presumably var_cpp(x, mx) returns the sample variance of x given
# its precomputed mean mx; confirm against their definitions.
my.t.test.cpp <- function (x, y = NULL) {
    nx <- length(x)
    mx <- sum_cpp(x) / nx          # mean of x via a C++ sum
    vx <- var_cpp(x, mx)
    ny <- length(y)
    my <- sum_cpp(y) / ny
    vy <- var_cpp(y, my)
    stderrx <- sqrt(vx/nx)         # standard error of mean(x)
    stderry <- sqrt(vy/ny)
    stderr <- sqrt(stderrx^2 + stderry^2)
    # Welch-Satterthwaite degrees of freedom
    df <- stderr^4/(stderrx^4/(nx - 1) + stderry^4/(ny - 1))
    tstat <- (mx - my - 0)/stderr
    pval <- 2 * pt(-abs(tstat), df)
    return(pval)
}

# Grouped-summarise wrapper around the C++-assisted scalar t-test.
fcpp <- function(df) {
    df %>% group_by(id) %>% summarize(p.value = my.t.test.cpp(A, B))
}

它产生与规范实现相同的结果,在我的机器上耗时约100毫秒。

对2.1解决方案进行性能剖析(profiling)表明,大部分时间都花在var()内部——其中包含对stopifnot()的调用以及参数匹配(pmatch)等开销:

> var
function (x, y = NULL, na.rm = FALSE, use) 
{
    ...
    na.method <- pmatch(use, c("all.obs", "complete.obs", "pairwise.complete.obs", 
        "everything", "na.or.complete"))
    ...
    if (is.data.frame(x)) 
        x <- as.matrix(x)
    else stopifnot(is.atomic(x))
    ... 
    .Call(C_cov, x, y, na.method, FALSE)
}
<bytecode: 0x5e1a440>
<environment: namespace:stats>

> Rprof(); x <- my.t.test.2.1(df1$id, df1$A, df1$B); Rprof(NULL); summaryRprof()
$by.self
                      self.time self.pct total.time total.pct
"withCallingHandlers"      0.04    28.57       0.08     57.14
"tryCatchList"             0.04    28.57       0.04     28.57
"vapply"                   0.02    14.29       0.14    100.00
"stopifnot"                0.02    14.29       0.12     85.71
"match.call"               0.02    14.29       0.02     14.29

$by.total
                      total.time total.pct self.time self.pct
"vapply"                    0.14    100.00      0.02    14.29
"my.t.test.2.1"             0.14    100.00      0.00     0.00
"stopifnot"                 0.12     85.71      0.02    14.29
"FUN"                       0.12     85.71      0.00     0.00
"withCallingHandlers"       0.08     57.14      0.04    28.57
"tryCatchList"              0.04     28.57      0.04    28.57
"tryCatch"                  0.04     28.57      0.00     0.00
"match.call"                0.02     14.29      0.02    14.29

$sample.interval
[1] 0.02

$sampling.time
[1] 0.14

因此,在追求速度时,可以跳过这些参数检查,直接调用底层的C函数:

# Like my.t.test.2.1, but bypasses stats::var()'s argument checking
# (pmatch()/stopifnot() dominate the profile) by calling the underlying C
# routine directly. The 4L selects the "everything" NA-handling method,
# matching var()'s default behaviour.
my.t.test.2.2 <- compiler::cmpfun(function(grp, x, y) {
    fast_var <- function(v)
        .Call(stats:::C_cov, v, NULL, 4L, FALSE)
    grp <- factor(grp)

    groups_x <- split.default(x, grp)
    sizes <- lengths(groups_x)   # shared group sizes (same grouping for x, y)
    sizes1 <- sizes - 1L
    mean_x <- vapply(groups_x, mean.default, numeric(1), USE.NAMES = FALSE)
    var_x <- vapply(groups_x, fast_var, numeric(1), USE.NAMES = FALSE)

    groups_y <- split.default(y, grp)
    mean_y <- vapply(groups_y, mean.default, numeric(1), USE.NAMES = FALSE)
    var_y <- vapply(groups_y, fast_var, numeric(1), USE.NAMES = FALSE)

    se2_x <- var_x / sizes
    se2_y <- var_y / sizes
    se <- sqrt(se2_x + se2_y)
    tstat <- (mean_x - mean_y) / se
    # Welch-Satterthwaite df, simplified under equal group sizes
    df <- se^4 / ((se2_x^2 + se2_y^2) / sizes1)

    2 * pt(-abs(tstat), df)
})

# Wrapper for the direct-C-call variant, same (id, p.value) output shape.
f2.2 <- function(df) {
    tibble(id = unique(df$id), p.value = my.t.test.2.2(df$id, df$A, df$B))
}

这确实表现不错。

> all.equal.default(f0(df1), f2.2(df1))
[1] TRUE
> microbenchmark(
+     f0(df1), f1(df1), f2(df1), f2.1(df1), f2.2(df1), fcpp(df1),
+     times = 5
+ )
Unit: milliseconds
      expr       min        lq      mean    median       uq       max neval
   f0(df1) 378.61985 379.25525 393.38371 379.56797 386.2806 443.19488     5
   f1(df1) 250.99802 252.45281 253.55140 253.34249 255.2801 255.68362     5
   f2(df1) 156.76073 158.63126 159.63693 160.33446 161.2260 161.23216     5
 f2.1(df1) 146.64555 148.28773 151.17250 151.38536 153.9363 155.60751     5
 f2.2(df1)  25.24441  25.62982  27.50898  26.11755  30.0836  30.46951     5
 fcpp(df1) 104.20851 104.50396 105.19383 104.62905 104.7876 107.84006     5

我们可以使用C ++实现方差计算,而不用调用R的计算

# Like my.t.test.2.2, but the per-group variance comes from the C++ helper
# var_cpp instead of a direct .Call into R's C covariance routine.
# NOTE(review): var_cpp is an Rcpp helper not shown in this file; it appears
# to take (values, precomputed mean) per group -- confirm against its
# definition.
my.t.test.2.2.cpp <- compiler::cmpfun(function(grp, x, y) {
    grp <- factor(grp)

    x_g <- split.default(x, grp)
    n <- lengths(x_g)    # shared group sizes (same grouping for x and y)
    n1 <- n - 1L
    x_mean <- vapply(x_g, mean.default, numeric(1), USE.NAMES = FALSE)
    x_var <- unlist(Map(var_cpp, x_g, x_mean))

    y_g <- split.default(y, grp)
    y_mean <- vapply(y_g, mean.default, numeric(1), USE.NAMES = FALSE)
    y_var <- unlist(Map(var_cpp, y_g, y_mean))

    x_se2 <- x_var / n
    y_se2 <- y_var / n
    se <- sqrt(x_se2 + y_se2)
    tstat <- (x_mean - y_mean) / se
    # Welch-Satterthwaite df, simplified under equal group sizes
    df <- se^4 / ((x_se2^2 + y_se2^2) / n1)

    2 * pt(-abs(tstat), df)
})

# Wrapper for the C++-variance variant, same (id, p.value) output shape.
f2.2.cpp <- function(df) {
    tibble(id = unique(df$id), p.value = my.t.test.2.2.cpp(df$id, df$A, df$B))
}

具有可比的性能

> microbenchmark(f2.2(df1), f2.2.cpp(df1), times = 20)
Unit: milliseconds
          expr      min       lq     mean   median       uq      max neval
     f2.2(df1) 25.11237 25.69622 30.27956 26.35570 29.81884 87.34955    20
 f2.2.cpp(df1) 24.88787 25.25171 26.80836 25.43498 29.06338 30.80012    20

我不确定哪种做法更算是一种hack——是自己用C++实现方差计算,还是直接调用R内部的C代码。

更快的C ++解决方案可在一次调用中计算组均值和方差

# C++ kernel: a single pass over the data computes per-group count, mean and
# sample variance. Group ids must be positive integers 1..max(group); an id
# missing from that range yields n == 0 and a division by zero for its
# mean/variance.
# Fix: return the full per-group count vector n. The original returned only
# n[0] (the first group's count), which silently assumed every group has the
# same size; returning the whole vector is backward compatible when sizes
# are equal and correct when they are not.
cppFunction('List doit(IntegerVector group, NumericVector x) {
  int n_grp = 0;
  for (int i = 0; i < group.size(); ++i)
      n_grp = group[i] > n_grp ? group[i] : n_grp;

  std::vector<int> n(n_grp);
  std::vector<double> sum(n_grp), sumsq(n_grp);
  for (int i = 0; i < group.size(); ++i) {
      n[ group[i] - 1 ] += 1;
      sum[ group[i] - 1 ] += x[i];
      sumsq[ group[i] - 1 ] += x[i] * x[i];
  }
  NumericVector mean(n_grp), var(n_grp);
  for (size_t i = 0; i < n.size(); ++i) {
      mean[i] = sum[i] / n[i];
      // one-pass sample variance: (sum(x^2) - n * mean^2) / (n - 1)
      var[i] = (sumsq[i] - sum[i] * mean[i]) / (n[i] - 1);
  }
  return List::create(_["n"]=n, _["mean"]=mean, _["var"]=var);
}')

# Welch t-test built on doit(): per-group moments come from a single C++
# pass over each measurement vector.
my.t.test.2.3.cpp <- compiler::cmpfun(function(grp, x, y) {
    gx <- doit(grp, x)
    gy <- doit(grp, y)

    se2_x <- gx$var / gx$n
    se2_y <- gy$var / gy$n
    se <- sqrt(se2_x + se2_y)
    tstat <- (gx$mean - gy$mean) / se
    # Welch-Satterthwaite df, simplified under equal group sizes for x and y
    df <- se^4 / ((se2_x^2 + se2_y^2) / (gx$n - 1L))

    2 * pt(-abs(tstat), df)
})

# Wrapper for the single-pass C++ moments variant.
f2.3.cpp <- function(df) {
    tibble(
        id = unique(df$id),
        p.value = my.t.test.2.3.cpp(df$id, df$A, df$B)
    )
}

这很快

> all.equal.default(f0(df1), f2.3.cpp(df1))
[1] TRUE
> microbenchmark(f2.2(df1), f2.2.cpp(df1), f2.3.cpp(df1), times = 50)
Unit: milliseconds
          expr       min        lq      mean    median        uq       max
     f2.2(df1) 24.743364 25.445833 28.032135 25.873117 29.191020 88.642771
 f2.2.cpp(df1) 24.122380 24.867212 26.012985 25.369963 25.897866 30.783544
 f2.3.cpp(df1)  2.831635  2.946094  3.101408  2.992049  3.073788  7.191572
 neval
    50
    50
    50
> 

另一种替代方法是使用Bioconductor包genefilter中的rowttests(),它需要一个矩阵作为输入。

# Matrix layout of the same simulated data: each row is one id, columns 1-8
# hold the A replicates and columns 9-16 the B replicates. The rnorm()
# calls run in the same order as before, so set.seed(1) reproduces the
# identical values.
set.seed(1)
a_block <- matrix(rnorm(8000, mean = 26, sd = 5), ncol = 8, byrow = TRUE)
b_block <- matrix(rnorm(8000, mean = 25, sd = 7), ncol = 8, byrow = TRUE)
m1 <- cbind(a_block, b_block)

# Row-wise t-tests via Bioconductor's genefilter; the factor marks
# columns 1-8 vs columns 9-16 as the two groups.
f4 <- function(m1) {
    genefilter::rowttests(m1, factor(rep(1:2, each = 8)))
}

而且速度快

> microbenchmark(f2.3.cpp(df1), f4(m1), times=50)
Unit: milliseconds
          expr      min       lq     mean   median       uq      max neval
 f2.3.cpp(df1) 2.760877 2.796542 2.877030 2.845795 2.895441 3.286143    50
        f4(m1) 1.335288 1.359007 1.397601 1.377544 1.412606 1.693340    50

(其中一部分耗时差异来自创建tibble的开销。)