在R中将三元组编码的基因型矩阵高效转换为0,1,2编码

时间:2017-09-01 18:34:34

标签: r performance matrix genome

您好我想将编码为三元组的基因型矩阵转换为编码为0,1,2的矩阵,即

c(1,0,0) <-> 0; c(0,1,0) <-> 1; c(0,0,1) <-> 2

首先是一些生成需要减少的矩阵的代码。

# Generate a random genotype matrix in triplet (one-hot) encoding.
#
# Every genotype (row) gets its own probability drawn from U(0, 1); each
# sample's genotype is then drawn as Binomial(size = 2, prob) and expanded:
#   0 -> c(1,0,0), 1 -> c(0,1,0), 2 -> c(0,0,1)
#
# Args:
#   n: number of samples.
#   p: number of genotypes.
#
# Returns:
#   A numeric p x 3n matrix. Columns 3j-2 .. 3j hold the triplet of sample j.
#   Rows are named g1..gp, columns s<sample><1|2|3>.
#
# Side effects:
#   Prints the top-left corner of the triplet matrix (at most 10 x 15) and of
#   the underlying 0/1/2 matrix (at most 10 x 5).
expand.G <- function(n, p) {
  probs <- runif(n = p)
  # The positional 2 is rbinom's `size`. `probs` is recycled down the columns
  # of the p-row matrix, so row i is drawn with probs[i].
  G012.rows <- matrix(rbinom(2, prob = probs, n = n * p), nrow = p)
  colnames(G012.rows) <- paste0('s', seq_len(n))
  rownames(G012.rows) <- paste0('g', seq_len(p))

  # Genotypes in row-major order: all samples of g1, then all samples of g2, ...
  g <- c(t(G012.rows))

  # One-hot encode every genotype at once. The 3 x (n*p) indicator matrix,
  # flattened column by column, is the sequence of consecutive triplets.
  # This replaces growing a vector with c() inside a loop, which re-copied the
  # whole result on every iteration.
  gtype <- as.numeric(rbind(g == 0, g == 1, g == 2))

  G <- matrix(gtype, byrow = TRUE, nrow = p)
  colnames(G) <- paste0('s', rep(seq_len(n), each = 3), c('1', '2', '3'))
  rownames(G) <- paste0('g', seq_len(p))

  # Preview; indices are clamped so that small n or p do not raise
  # "subscript out of bounds".
  show.rows <- seq_len(min(10, p))
  print(G[show.rows, seq_len(min(15, 3 * n)), drop = FALSE])
  print(G012.rows[show.rows, seq_len(min(5, n)), drop = FALSE])

  return(G)
}

输出具有3n列和p行,其中n是样本大小,p是基因型的数量。现在我们可以使用以下函数将矩阵缩减回0,1,2编码

# Map a single genotype triplet to its 0/1/2 code.
#
# Args:
#   x: a vector that is compared with identical() against the three valid
#      triplets, so it must match one of them exactly (including type).
#
# Returns:
#   0 for c(1,0,0), 1 for c(0,1,0), 2 for c(0,0,1); NA for anything else.
reduce012 <- function(x) {
  patterns <- list(c(1, 0, 0), c(0, 1, 0), c(0, 0, 1))
  for (k in seq_along(patterns)) {
    if (identical(x, patterns[[k]])) {
      # Position in the list is code + 1.
      return(k - 1)
    }
  }
  NA
}

# Collapse a triplet-encoded genotype matrix back to 0/1/2 codes.
#
# Args:
#   G.gen: p x 3n matrix (genotypes in rows); columns 3j-2 .. 3j hold the
#          triplet of sample j. Values are coerced with as.numeric().
#
# Returns:
#   A numeric n x p matrix (the transpose layout of the input) with entry
#   [j, i] equal to 0 for c(1,0,0), 1 for c(0,1,0), 2 for c(0,0,1) and NA for
#   any other triplet. Column names are the row names of G.gen.
#
# Raises:
#   An error if ncol(G.gen) is not a multiple of 3; such trailing columns were
#   previously dropped without notice.
reduce.G <- function(G.gen) {
  if (ncol(G.gen) %% 3 != 0) {
    stop("ncol(G.gen) must be a multiple of 3", call. = FALSE)
  }
  n.samples <- ncol(G.gen) / 3

  # t() followed by a byrow reshape walks G.gen row by row, so row k of
  # `triplets` is the triplet of (genotype i, sample j) with j varying fastest.
  triplets <- matrix(as.numeric(t(G.gen)), ncol = 3, byrow = TRUE)
  first <- triplets[, 1]
  second <- triplets[, 2]
  third <- triplets[, 3]

  # Decode all triplets in one pass instead of one R-level call per triplet.
  # which() drops NA comparisons, so triplets containing NA stay NA.
  G.vec <- rep(NA_real_, nrow(triplets))
  G.vec[which(first == 1 & second == 0 & third == 0)] <- 0
  G.vec[which(first == 0 & second == 1 & third == 0)] <- 1
  G.vec[which(first == 0 & second == 0 & third == 1)] <- 2

  # Column-major fill: samples run down the rows, genotypes across columns.
  G <- matrix(G.vec, nrow = n.samples, ncol = nrow(G.gen))
  colnames(G) <- rownames(G.gen)
  G
}

# Loop-based reduction of a triplet-encoded genotype matrix to 0/1/2 codes.
#
# Args:
#   G.gen: matrix with genotypes in rows and three consecutive columns per
#          sample.
#
# Returns:
#   A matrix with ncol(G.gen)/3 rows and nrow(G.gen) columns whose [sample,
#   genotype] entry is reduce012() of the corresponding triplet. Column names
#   are the row names of G.gen.
reduce.G.loop <- function(G.gen) {
  n.samples <- ncol(G.gen) / 3
  n.geno <- nrow(G.gen)

  out <- matrix(NA, nrow = n.samples, ncol = n.geno)
  for (geno in 1:n.geno) {
    for (samp in 1:n.samples) {
      # Columns (last - 2):last hold the triplet of sample `samp`.
      last <- 3 * samp
      triplet <- as.numeric(G.gen[geno, (last - 2):last])
      out[samp, geno] <- reduce012(triplet)
    }
  }

  colnames(out) <- rownames(G.gen)
  return(out)
}

输出是n行乘p列。顺带一提(这是有意为之),编码为0,1,2的矩阵是编码为三元组的矩阵的转置。

代码不是特别快。困扰我的是运行时间随n^2增长。你能解释原因,或者提供更高效的代码吗?

# Benchmark both reducers on 20 genotypes while doubling the number of
# samples (1000 -> 2000 -> 4000) to show how the run time scales with n.
G = expand.G(1000,20)
system.time(reduce.G(G))
system.time(reduce.G.loop(G))

# Twice as many samples.
G = expand.G(2000,20)
system.time(reduce.G(G))
system.time(reduce.G.loop(G))

# Four times as many samples; G keeps this value for the code that follows.
G = expand.G(4000,20)
system.time(reduce.G(G))
system.time(reduce.G.loop(G))

2 个答案:

答案 0 :(得分:2)

您只需创建一个用于索引的查找表:

# 3 x 3 x 3 lookup table indexed by (triplet value + 1) along each dimension.
# Only the three valid triplets carry a code; every other cell stays NA.
decode <- array(NA, dim = c(3, 3, 3))
decode[2, 1, 1] <- 0  # triplet c(1,0,0)
decode[1, 2, 1] <- 1  # triplet c(0,1,0)
decode[1, 1, 2] <- 2  # triplet c(0,0,1)

然后,就这样做:

# Shift G by 1 for 1-based indexing, reshape it (row by row) into one triplet
# per row, use each row as a 3-d index into `decode`, then fold the resulting
# codes into a matrix with nrow(G) columns.
matrix(decode[matrix(t(G + 1), ncol = 3, byrow = TRUE)], ncol = nrow(G))

这个完全矢量化的R版本会给出相同的矩阵(只是不带dimnames),而且速度极快。

然而,如果你有更大的矩阵,你应该真正使用Rcpp来解决内存和时序问题。

答案 1 :(得分:1)

这似乎比您的版本(重命名为reduce.G.orig)快三倍:

# Reduce a triplet-encoded genotype matrix to 0/1/2 codes by pasting each
# sample's three columns into a key ("100", "010", "001") and looking it up.
#
# Args:
#   G: matrix with genotypes in rows and three consecutive columns per sample.
#
# Returns:
#   A matrix with one row per sample and one column per genotype; keys that
#   are not in the lookup yield NA. Column names are the row names of G.
reduce.G <- function(G) {
  codes <- c("100" = 0, "010" = 1, "001" = 2)

  # Decode one sample; `k` is the zero-based sample offset.
  decode.sample <- function(k) {
    first <- 3 * k + 1
    key <- paste(G[, first], G[, first + 1], G[, first + 2], sep = "")
    codes[key]
  }

  # `:` binds tighter than `-`, so this is 0, 1, ..., ncol(G)/3 - 1.
  offsets <- 1:(ncol(G) / 3) - 1
  decoded <- do.call(rbind, lapply(offsets, decode.sample))
  colnames(decoded) <- rownames(G)
  decoded
}

# Timing comparison. reduce.G.orig is the question's reduce.G under a new
# name; it is not defined in this snippet and must be created before running.
# The commented lines are the output recorded by the answer's author.
system.time(reduce.G(G))
#   user  system elapsed 
#  0.156   0.000   0.155 

system.time(reduce.G.orig(G))
#   user  system elapsed 
#  0.444   0.000   0.441 

# Both implementations return exactly the same matrix.
identical(reduce.G(G), reduce.G.orig(G))
# [1] TRUE