Question

我正在循环中对遗传标记数据框进行'B等位基因频率'（BAF）计算。基于BAF概念背后的数学逻辑，我试图编写一个代码来执行它，但是，效率太低。

我的意见：

theta <- 'Probe sample1 sample2 sample3 sample4 sample5   AAm   ABm   BBm
         AX-1   0.674   0.756   0.694   0.671   0.754   0.167   0.281 0.671
         AX-2   0.117   0.907   0.501   0.904   0.548   0.116   0.506 0.903
         AX-3   0.068   0.075   0.071   0.208   0.038   0.06    0.445 0.846'
theta <- read.table(text=theta, header=T)

我的剧本：

theta.split <- split(theta, 1:nrow(theta))

for(k in 1:length(theta.split)){
  thetax <- as.data.frame(theta.split[[k]])
  for(i in 2:(ncol(thetax)-3)){
     if(as.numeric(as.character(thetax[1,i])) < as.numeric(as.character(thetax$AAm))){
    thetax[1,i] <- 0}

    if(as.numeric(as.character(thetax[1,i])) >= as.numeric(as.character(thetax$AAm)) && as.numeric(as.character(thetax[1,i])) < as.numeric(as.character(thetax$ABm))){
      thex <- as.numeric(as.character(thetax[1,i]))
      theAA <- as.numeric(as.character(thetax$AAm))
      theAB <- as.numeric(as.character(thetax$ABm))
      bafx <- ((0.5)*(thex - theAA))/(theAB - theAA)
      thetax[1,i] <- bafx}

    if(as.numeric(as.character(thetax[1,i])) >= as.numeric(as.character(thetax$ABm)) && as.numeric(as.character(thetax[1,i])) < as.numeric(as.character(thetax$BBm))){
      thex <- as.numeric(as.character(thetax[1,i]))
      theAB <- as.numeric(as.character(thetax$ABm))
      theBB <- as.numeric(as.character(thetax$BBm))
      bafx <- 0.5 + ((0.5)*(thex-theAB)/(theBB-theAB))
      thetax[1,i] <- bafx}

    if(as.numeric(as.character(thetax[1,i])) >= as.numeric(as.character(thetax$BBm))){
      thetax[1,i] <- 1}

  }
  theta[k,] <- thetax
}
out <- theta

我的预期输出：

out <- 'Probe  sample1 sample2  sample3  sample4     sample5     AAm       ABm   BBm
        AX-1    1.000   1.000   1.000   1.000   1.000  0.167  0.281 0.671
        AX-2    0.001   1.000   0.493   1.000   0.552  0.116  0.506 0.903
        AX-3    0.010   0.019   0.014   0.192   0.000  0.06   0.445 0.846'
out <- read.table(text=out, header=T)

我很感激能让这段代码变得更聪明的任何想法。

Answer 1

您可以利用应用和矢量化计算来避免循环。以下需要超过三分之一的时间：

library(dplyr)

#Take main code in your loops out as a function
#Using vectorised logical calcs instead of if statements
#sampleVec will be a vector and thetaDf will be the original theta dataframe
bafxFn <- function(sampleVec, thetaDf) {

  testAAm <- sampleVec < thetaDf$AAm
  sampleVec <- sampleVec * (1 - testAAm)

  testAAmABm <- (sampleVec >= thetaDf$AAm) * (sampleVec < thetaDf$ABm)
  bafx <- ((0.5) * (sampleVec - thetaDf$AAm)) / (thetaDf$ABm - thetaDf$AAm)
  sampleVec <- testAAmABm * bafx + (1 - testAAmABm) * sampleVec

  testABmBBm <- (sampleVec >= thetaDf$ABm) * (sampleVec < thetaDf$BBm)
  bafx <- 0.5 + ((0.5) * (sampleVec - thetaDf$ABm)) / (thetaDf$BBm - thetaDf$ABm)
  sampleVec <- testABmBBm * bafx + (1 - testABmBBm) * sampleVec

  testBBm <- sampleVec >= thetaDf$BBm
  sampleVec <- testBBm * 1 + (1 - testBBm) * sampleVec

  sampleVec
}

#Subset original data frame to just leave the sample columns (using dplyr's select function)
sampleDf <- 
  theta %>% select(-Probe, -AAm, -ABm, -BBm)

#Use apply to loop through columns of remaining data
#passing columns in as vectors
outSampleDf <- 
  sampleDf %>%
  apply(2, bafxFn, thetaDf = theta) %>%
  as.data.frame()

#And then bind results back together (using dplyr's bind_cols)
outDf <- 
  bind_cols(
    theta %>% select(Probe),
    outSampleDf,
    theta %>% select(AAm, ABm, BBm)
  )

可能有一种更简洁的方法来处理某些子集，但是如果您有超过5个样本列，则尝试对其进行概括。

outDf
Source: local data frame [3 x 9]

   Probe     sample1    sample2    sample3   sample4   sample5   AAm   ABm   BBm
  (fctr)       (dbl)      (dbl)      (dbl)     (dbl)     (dbl) (dbl) (dbl) (dbl)
1   AX-1 1.000000000 1.00000000 1.00000000 1.0000000 1.0000000 0.167 0.281 0.671
2   AX-2 0.001282051 1.00000000 0.49358974 1.0000000 0.5528967 0.116 0.506 0.903
3   AX-3 0.010389610 0.01948052 0.01428571 0.1922078 0.0000000 0.060 0.445 0.846

BAF计算循环的替代方案

1 个答案: