与r中的cooccur包函数并行化

时间:2015-12-02 09:41:40

标签: r parallel-processing

我正在使用cooccur包计算非常庞大的数据集中物种的同时发生。 该软件包基于概率模型,该模型在计算方面要求很高。

因此,我想知道如何将计算并行化以获得更快的结果。 我已经看到像doParallelsnowfall之类的软件包可以完成这项工作但是我尝试使用它们并且因为它们需要循环结构而没有真正成功。

install.packages("cooccur")
library(cooccur)
data(finches)
system.time(
  co <- cooccur(finches, thresh = FALSE, spp_names = TRUE)
)

通过这个例子,计算速度很快,但是对于更大的数据集,计算速度非常慢。

请注意,在Ubuntu上,coocur包依赖于需要gmp的{​​{1}}。

1 个答案:

答案 0 :(得分:0)

看起来如果你想并行化这个函数,你必须跳入函数本身,看看哪些(如果有的话)嵌套循环可以拆开。那里有/吨/圈。

哪些嵌套循环导致您遇到的问题最多(并且应该并行化)可能取决于您的特定问题和特定数据集。为帮助诊断问题,请考虑使用hadley的分析功能(如下所示)来帮助识别可能重写功能的位置。请记住,您可能希望使用相对大量的数据运行性能分析测试(和速度测试),以便找到合适的修剪位置。此时,您还应该考虑它是worth the time

library(cooccur)
library(devtools)
library(lineprof)
data(finches)
devtools::install_github("hadley/lineprof")
l <- lineprof(co <- cooccur(finches, thresh = FALSE, spp_names = TRUE))
shine(l)

首先,您可能需要查看大1:nrow(obs_coocur)循环。在使用雀科数据集的测试中,我无法提高速度,结果似乎有些退化(需要清理大量NA行,即使结果也不相同)。

以下废弃的草稿功能:

mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) 
{
  FUN <- match.fun(FUN)
  answer <- mclapply(X = X, FUN = FUN, ...)
  if (USE.NAMES && is.character(X) && is.null(names(answer))) 
    names(answer) <- X
  if (!identical(simplify, FALSE) && length(answer)) 
    simplify2array(answer, higher = (simplify == "array"))
  else answer
}

cooccurLocal <- function (mat, type = "spp_site", thresh = TRUE, spp_names = FALSE, 
    true_rand_classifier = 0.1, prob = "hyper", only_effects = FALSE, 
    eff_standard = TRUE, eff_matrix = FALSE) 
{
    if (type == "spp_site") {
        spp_site_mat <- mat
    }
    if (type == "site_spp") {
        spp_site_mat <- t(mat)
    }
    if (spp_names == TRUE) {
        spp_key <- data.frame(num = 1:nrow(spp_site_mat), spp = row.names(spp_site_mat))
    }
    spp_site_mat[spp_site_mat > 0] <- 1
    nsite <- ncol(spp_site_mat)
    nspp <- nrow(spp_site_mat)
    spp_pairs <- choose(nspp, 2)
    incidence <- prob_occur <- matrix(nrow = nspp, ncol = 2)
    obs_cooccur <- prob_cooccur <- exp_cooccur <- matrix(nrow = spp_pairs, 
        ncol = 3)
    prob_share_site <- c(0:(nsite + 1))
    incidence <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat, 
        na.rm = T))
    prob_occur <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat, 
        na.rm = T)/nsite)
    pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)), 
        style = 3)
    row <- 0
    for (spp in 1:nspp) {
        if (spp < nspp) {
            for (spp_next in (spp + 1):nspp) {
                row <- row + 1
                pairs <- 0
                for (site in 1:nsite) {
                  if (spp_site_mat[spp, site] > 0 & spp_site_mat[spp_next, 
                    site] > 0) {
                    pairs <- pairs + 1
                  }
                }
                obs_cooccur[row, 1] <- spp
                obs_cooccur[row, 2] <- spp_next
                obs_cooccur[row, 3] <- pairs
                prob_cooccur[row, 1] <- spp
                prob_cooccur[row, 2] <- spp_next
                prob_cooccur[row, 3] <- prob_occur[spp, 2] * 
                  prob_occur[spp_next, 2]
                exp_cooccur[row, 1] <- spp
                exp_cooccur[row, 2] <- spp_next
                exp_cooccur[row, 3] <- prob_cooccur[row, 3] * 
                  nsite
            }
        }
        setTxtProgressBar(pb, spp)
    }
    if (thresh == TRUE) {
        n_pairs <- nrow(prob_cooccur)
        prob_cooccur <- prob_cooccur[exp_cooccur[, 3] >= 1, ]
        obs_cooccur <- obs_cooccur[exp_cooccur[, 3] >= 1, ]
        exp_cooccur <- exp_cooccur[exp_cooccur[, 3] >= 1, ]
        n_omitted <- n_pairs - nrow(prob_cooccur)
        pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)), 
            style = 3)
    }
    output <- data.frame(matrix(nrow = 0, ncol = 9))
    colnames(output) <- c("sp1", "sp2", "sp1_inc", "sp2_inc", 
        "obs_cooccur", "prob_cooccur", "exp_cooccur", "p_lt", 
        "p_gt")
    output <- mcsapply(1:nrow(obs_cooccur), function(row) {
        sp1 <- obs_cooccur[row, 1]
        sp2 <- obs_cooccur[row, 2]
        sp1_inc <- incidence[incidence[, 1] == sp1, 2]
        sp2_inc <- incidence[incidence[, 1] == sp2, 2]
        max_inc <- max(sp1_inc, sp2_inc)
        min_inc <- min(sp1_inc, sp2_inc)
        prob_share_site <- rep(0, (nsite + 1))
        if (prob == "hyper") {
            if (only_effects == FALSE) {
                all.probs <- phyper(0:min_inc, min_inc, nsite - 
                  min_inc, max_inc)
                prob_share_site[1] <- all.probs[1]
                for (j in 2:length(all.probs)) {
                  prob_share_site[j] <- all.probs[j] - all.probs[j - 
                    1]
                }
            }
            else {
                for (j in 0:nsite) {
                  if ((sp1_inc + sp2_inc) <= (nsite + j)) {
                    if (j <= min_inc) {
                      prob_share_site[(j + 1)] <- 1
                    }
                  }
                }
            }
        }
        if (prob == "comb") {
            if (only_effects == FALSE) {
                for (j in 0:nsite) {
                  if ((sp1_inc + sp2_inc) <= (nsite + j)) {
                    if (j <= min_inc) {
                      prob_share_site[(j + 1)] <- coprob(max_inc = max_inc, 
                        j = j, min_inc = min_inc, nsite = nsite)
                    }
                  }
                }
            }
            else {
                for (j in 0:nsite) {
                  if ((sp1_inc + sp2_inc) <= (nsite + j)) {
                    if (j <= min_inc) {
                      prob_share_site[(j + 1)] <- 1
                    }
                  }
                }
            }
        }
        p_lt <- 0
        p_gt <- 0
        for (j in 0:nsite) {
            if (j <= obs_cooccur[row, 3]) {
                p_lt <- prob_share_site[(j + 1)] + p_lt
            }
            if (j >= obs_cooccur[row, 3]) {
                p_gt <- prob_share_site[(j + 1)] + p_gt
            }
            if (j == obs_cooccur[row, 3]) {
                p_exactly_obs <- prob_share_site[(j + 1)]
            }
        }
        p_lt <- round(p_lt, 5)
        p_gt <- round(p_gt, 5)
        p_exactly_obs <- round(p_exactly_obs, 5)
        prob_cooccur[row, 3] <- round(prob_cooccur[row, 3], 3)
        exp_cooccur[row, 3] <- round(exp_cooccur[row, 3], 1)
        output[row, ] <- c(sp1, sp2, sp1_inc, sp2_inc, obs_cooccur[row, 
            3], prob_cooccur[row, 3], exp_cooccur[row, 3], p_lt, 
            p_gt)
        return(output)
    }, simplify=FALSE)
    output <- do.call("rbind", output)
    output <- output[!is.na(output$sp1),]
    close(pb)
    if (spp_names == TRUE) {
        sp1_name <- merge(x = data.frame(order = 1:length(output$sp1), 
            sp1 = output$sp1), y = spp_key, by.x = "sp1", by.y = "num", 
            all.x = T, sort = FALSE)
        sp2_name <- merge(x = data.frame(order = 1:length(output$sp2), 
            sp2 = output$sp2), y = spp_key, by.x = "sp2", by.y = "num", 
            all.x = T, sort = FALSE)
        output$sp1_name <- sp1_name[with(sp1_name, order(order)), 
            "spp"]
        output$sp2_name <- sp2_name[with(sp2_name, order(order)), 
            "spp"]
    }
    true_rand <- (nrow(output[(output$p_gt >= 0.05 & output$p_lt >= 
        0.05) & (abs(output$obs_cooccur - output$exp_cooccur) <= 
        (nsite * true_rand_classifier)), ]))
    output_list <- list(call = match.call(), results = output, 
        positive = nrow(output[output$p_gt < 0.05, ]), negative = nrow(output[output$p_lt < 
            0.05, ]), co_occurrences = (nrow(output[output$p_gt < 
            0.05 | output$p_lt < 0.05, ])), pairs = nrow(output), 
        random = true_rand, unclassifiable = nrow(output) - (true_rand + 
            nrow(output[output$p_gt < 0.05, ]) + nrow(output[output$p_lt < 
            0.05, ])), sites = nsite, species = nspp, percent_sig = (((nrow(output[output$p_gt < 
            0.05 | output$p_lt < 0.05, ])))/(nrow(output))) * 
            100, true_rand_classifier = true_rand_classifier)
    if (spp_names == TRUE) {
        output_list$spp_key <- spp_key
        output_list$spp.names = row.names(spp_site_mat)
    }
    else {
        output_list$spp.names = c(1:nrow(spp_site_mat))
    }
    if (thresh == TRUE) {
        output_list$omitted <- n_omitted
        output_list$pot_pairs <- n_pairs
    }
    class(output_list) <- "cooccur"
    if (only_effects == F) {
        output_list
    }
    else {
        effect.sizes(mod = output_list, standardized = eff_standard, 
            matrix = eff_matrix)
    }
}