我正在循环中对遗传标记数据框进行'B等位基因频率'(BAF)计算。基于BAF概念背后的数学逻辑,我试图编写一个代码来执行它,但是,效率太低。
我的意见:
theta <- 'Probe sample1 sample2 sample3 sample4 sample5 AAm ABm BBm
AX-1 0.674 0.756 0.694 0.671 0.754 0.167 0.281 0.671
AX-2 0.117 0.907 0.501 0.904 0.548 0.116 0.506 0.903
AX-3 0.068 0.075 0.071 0.208 0.038 0.06 0.445 0.846'
theta <- read.table(text=theta, header=T)
我的剧本:
theta.split <- split(theta, 1:nrow(theta))
for(k in 1:length(theta.split)){
thetax <- as.data.frame(theta.split[[k]])
for(i in 2:(ncol(thetax)-3)){
if(as.numeric(as.character(thetax[1,i])) < as.numeric(as.character(thetax$AAm))){
thetax[1,i] <- 0}
if(as.numeric(as.character(thetax[1,i])) >= as.numeric(as.character(thetax$AAm)) && as.numeric(as.character(thetax[1,i])) < as.numeric(as.character(thetax$ABm))){
thex <- as.numeric(as.character(thetax[1,i]))
theAA <- as.numeric(as.character(thetax$AAm))
theAB <- as.numeric(as.character(thetax$ABm))
bafx <- ((0.5)*(thex - theAA))/(theAB - theAA)
thetax[1,i] <- bafx}
if(as.numeric(as.character(thetax[1,i])) >= as.numeric(as.character(thetax$ABm)) && as.numeric(as.character(thetax[1,i])) < as.numeric(as.character(thetax$BBm))){
thex <- as.numeric(as.character(thetax[1,i]))
theAB <- as.numeric(as.character(thetax$ABm))
theBB <- as.numeric(as.character(thetax$BBm))
bafx <- 0.5 + ((0.5)*(thex-theAB)/(theBB-theAB))
thetax[1,i] <- bafx}
if(as.numeric(as.character(thetax[1,i])) >= as.numeric(as.character(thetax$BBm))){
thetax[1,i] <- 1}
}
theta[k,] <- thetax
}
out <- theta
我的预期输出:
out <- 'Probe sample1 sample2 sample3 sample4 sample5 AAm ABm BBm
AX-1 1.000 1.000 1.000 1.000 1.000 0.167 0.281 0.671
AX-2 0.001 1.000 0.493 1.000 0.552 0.116 0.506 0.903
AX-3 0.010 0.019 0.014 0.192 0.000 0.06 0.445 0.846'
out <- read.table(text=out, header=T)
我很感激能让这段代码变得更聪明的任何想法。
答案 0 :(得分:1)
您可以利用应用和矢量化计算来避免循环。以下需要超过三分之一的时间:
library(dplyr)
#Take main code in your loops out as a function
#Using vectorised logical calcs instead of if statements
#sampleVec will be a vector and thetaDf will be the original theta dataframe
bafxFn <- function(sampleVec, thetaDf) {
testAAm <- sampleVec < thetaDf$AAm
sampleVec <- sampleVec * (1 - testAAm)
testAAmABm <- (sampleVec >= thetaDf$AAm) * (sampleVec < thetaDf$ABm)
bafx <- ((0.5) * (sampleVec - thetaDf$AAm)) / (thetaDf$ABm - thetaDf$AAm)
sampleVec <- testAAmABm * bafx + (1 - testAAmABm) * sampleVec
testABmBBm <- (sampleVec >= thetaDf$ABm) * (sampleVec < thetaDf$BBm)
bafx <- 0.5 + ((0.5) * (sampleVec - thetaDf$ABm)) / (thetaDf$BBm - thetaDf$ABm)
sampleVec <- testABmBBm * bafx + (1 - testABmBBm) * sampleVec
testBBm <- sampleVec >= thetaDf$BBm
sampleVec <- testBBm * 1 + (1 - testBBm) * sampleVec
sampleVec
}
#Subset original data frame to just leave the sample columns (using dplyr's select function)
sampleDf <-
theta %>% select(-Probe, -AAm, -ABm, -BBm)
#Use apply to loop through columns of remaining data
#passing columns in as vectors
outSampleDf <-
sampleDf %>%
apply(2, bafxFn, thetaDf = theta) %>%
as.data.frame()
#And then bind results back together (using dplyr's bind_cols)
outDf <-
bind_cols(
theta %>% select(Probe),
outSampleDf,
theta %>% select(AAm, ABm, BBm)
)
可能有一种更简洁的方法来处理某些子集,但是如果您有超过5个样本列,则尝试对其进行概括。
outDf
Source: local data frame [3 x 9]
Probe sample1 sample2 sample3 sample4 sample5 AAm ABm BBm
(fctr) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
1 AX-1 1.000000000 1.00000000 1.00000000 1.0000000 1.0000000 0.167 0.281 0.671
2 AX-2 0.001282051 1.00000000 0.49358974 1.0000000 0.5528967 0.116 0.506 0.903
3 AX-3 0.010389610 0.01948052 0.01428571 0.1922078 0.0000000 0.060 0.445 0.846