在R中使用apply(或其他迭代函数)创建向量矩阵

时间:2016-11-05 20:16:17

标签: r matrix encoding lapply orthogonal

我需要在一组八聚体(8个字母的集合)上运行正交编码函数,并将它们作为nx160数字的矩阵返回(其中n是数据上的八聚体数)。

正交编码功能是:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# NOTE(review): this Python snippet appears unrelated to the surrounding R
# question; it is presumably a page-extraction artifact - confirm before
# relying on it.

import sys
# Forces the interpreter-wide default string encoding to UTF-8.
# NOTE(review): `reload` as a bare builtin suggests this targets Python 2
# only - confirm.
reload(sys)
sys.setdefaultencoding("utf-8")

有些人问起过,所以这里给出一个例子,尽管这并不是出问题的部分:

  # One-hot ("orthogonal") encode amino-acid sequences.
  #
  # Args:
  #   octamer: character vector (or factor) of sequences written in the
  #     20-letter amino-acid alphabet defined below.
  #
  # Returns:
  #   * For a single sequence of k residues: a numeric vector of length
  #     20 * k. It is the k x 20 indicator matrix (row = position,
  #     column = amino acid) flattened column by column, so for an octamer
  #     entries 1-8 are the "A" indicators of positions 1-8, entries 9-16
  #     the "R" indicators, and so on.
  #   * For several sequences: a numeric matrix with one such vector per
  #     row (n x 160 for n octamers).
  #   * For zero-length input: a 0 x 0 numeric matrix.
  #
  # Errors if a sequence contains a letter outside the alphabet, or if the
  # sequences do not all have the same number of residues.
  orthocode <- function(octamer) {
    amino_acids <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
                     "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
    # Row i of the identity matrix is the one-hot code of amino_acids[i].
    identity_codes <- diag(length(amino_acids))

    # Encode one already-split sequence as a flat numeric vector.
    encode_one <- function(residues) {
      idx <- match(residues, amino_acids)
      if (anyNA(idx)) {
        stop("unknown residue(s): ",
             paste(unique(residues[is.na(idx)]), collapse = ", "),
             call. = FALSE)
      }
      # drop = FALSE keeps a matrix even for one residue; as.vector() then
      # flattens it column-major.
      as.vector(identity_codes[idx, , drop = FALSE])
    }

    # Split every input element, not just the first one, so that a whole
    # data-frame column can be encoded in a single call.
    residue_list <- strsplit(as.character(octamer), "", fixed = TRUE)
    codes <- lapply(residue_list, encode_one)

    if (length(codes) == 0L) {
      return(matrix(numeric(0), nrow = 0L, ncol = 0L))
    }
    if (length(codes) == 1L) {
      # A single sequence yields a plain vector.
      return(codes[[1L]])
    }
    if (length(unique(lengths(codes))) > 1L) {
      stop("all sequences must have the same number of residues",
           call. = FALSE)
    }
    do.call(rbind, codes)
  }

该函数对单个八聚体可以正常工作,但当我尝试使用lapply时,结果只是一个包含160个数字的向量,而且这次编码内容也变了(并且没有意义)。

 orthocode("ARNDCQEG")
 [1] 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [81] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

结果如下:

lapply(data[1], orthocode)

orthocode函数本身实际上可以正常工作。我需要知道的是如何从数据框中取出八聚体,对它们进行运算,最终得到一个类似于此的矩阵:

$V1
[1] 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[81] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

输出必须是n行,160列的矩阵。在我必须运行的数据上,结果矩阵应该是947x160。

有什么想法吗?

3 个答案:

答案 0 :(得分:2)

switch的语义与其他语言中的CASE结构相同。由于没有合适的示例数据,这里只做了简单测试,不过可以试试这个:

# Encode the first element of `octamer` as concatenated 20-slot one-hot
# blocks, one block per residue, in sequence order.
#
# The result always starts with a block of 20 zeros (the initial seed),
# so a sequence of k recognised letters yields a numeric vector of length
# 20 * (k + 1). Letters that match none of the switch() cases contribute
# nothing to the output.
orthocode <- function(octamer){
    # Length-20 numeric vector with a single 1 at position `pos`.
    one_hot <- function(pos) replace(numeric(20), pos, 1)

    residues <- strsplit(as.character(octamer), "")[[1]]

    # One list entry per residue; switch() yields NULL for unknown letters.
    blocks <- lapply(residues, function(residue) {
        switch(residue,
               "A" = one_hot(1),  "R" = one_hot(2),  "N" = one_hot(3),
               "D" = one_hot(4),  "C" = one_hot(5),  "Q" = one_hot(6),
               "E" = one_hot(7),  "G" = one_hot(8),  "H" = one_hot(9),
               "I" = one_hot(10), "L" = one_hot(11), "K" = one_hot(12),
               "M" = one_hot(13), "F" = one_hot(14), "P" = one_hot(15),
               "S" = one_hot(16), "T" = one_hot(17), "W" = one_hot(18),
               "Y" = one_hot(19), "V" = one_hot(20))
    })

    # unlist() drops the NULL entries and concatenates the rest in order.
    c(numeric(20), unlist(blocks))
}

请注意,我删除了matcode <- c(matcode)的行,因为它具有破坏矩阵结构的副作用。使用此:

dat <- list("ARNDE", "CQEGD")

我明白了:

     t( sapply(dat, orthocode) )
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
[1,]    0    0    0    0    0    0    0    0    0     0     0     0     0     0     0     0     0
[2,]    0    0    0    0    0    0    0    0    0     0     0     0     0     0     0     0     0
     [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32]
[1,]     0     0     0     1     0     0     0     0     0     0     0     0     0     0     0
[2,]     0     0     0     0     0     0     0     1     0     0     0     0     0     0     0
     [,33] [,34] [,35] [,36] [,37] [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47]
[1,]     0     0     0     0     0     0     0     0     0     1     0     0     0     0     0
[2,]     0     0     0     0     0     0     0     0     0     0     0     0     0     1     0
     [,48] [,49] [,50] [,51] [,52] [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62]
[1,]     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
[2,]     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
     [,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72] [,73] [,74] [,75] [,76] [,77]
[1,]     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0
[2,]     0     0     0     0     1     0     0     0     0     0     0     0     0     0     0
     [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85] [,86] [,87] [,88] [,89] [,90] [,91] [,92]
[1,]     0     0     0     0     0     0     1     0     0     0     0     0     0     0     0
[2,]     0     0     0     0     0     0     0     0     0     0     1     0     0     0     0
     [,93] [,94] [,95] [,96] [,97] [,98] [,99] [,100] [,101] [,102] [,103] [,104] [,105] [,106]
[1,]     0     0     0     0     0     0     0      0      0      0      0      0      0      0
[2,]     0     0     0     0     0     0     0      0      0      0      0      1      0      0
     [,107] [,108] [,109] [,110] [,111] [,112] [,113] [,114] [,115] [,116] [,117] [,118] [,119]
[1,]      1      0      0      0      0      0      0      0      0      0      0      0      0
[2,]      0      0      0      0      0      0      0      0      0      0      0      0      0
     [,120]
[1,]      0
[2,]      0

如果在函数末尾改用下面这几行,我更喜欢得到的结果(但这不是你所要求的格式):

    # Alternative ending for the function body: return a labelled matrix.
    # NOTE(review): this indexing needs `matcode` to be a matrix (e.g. built
    # with rbind as in the question's version) - confirm before use.
    # Remove the first row (presumably the all-zero seed row); drop = FALSE
    # keeps the result a matrix even when only one row remains.
    matcode <- matcode[-1, ,drop=FALSE]
    # Label each row with the letter it came from.
    rownames(matcode) <- octamer_split
    return(matcode)  # here the return call is needed.

答案 1 :(得分:2)

我们可以用match来简化ifelse,这样就可以去掉for循环

# One-hot encode the first element of `octamer`.
#
# Args:
#   octamer: a sequence string (or factor); only its first element is used.
#
# Returns:
#   A numeric matrix with one row per residue and 20 columns (one per amino
#   acid, in the order of `amino_acids`). Row names are the residue letters.
#   A letter outside the alphabet yields an all-zero row; an empty string
#   yields a 0 x 20 matrix.
orthocode <- function(octamer) {
  amino_acids <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
                   "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
  residues <- strsplit(as.character(octamer), "")[[1]]

  # Length-20 indicator vector for a single letter.
  one_hot <- function(letter) {
    code <- numeric(length(amino_acids))
    # match() gives NA for an unknown letter; assigning a length-one value
    # through an NA index changes nothing, so the code stays all zeros.
    code[match(letter, amino_acids)] <- 1
    code
  }

  # vapply() always returns a 20 x n matrix (even for n = 0), unlike
  # sapply(), whose return type depends on the input.
  t(vapply(residues, one_hot, numeric(length(amino_acids))))
}

答案 2 :(得分:2)

R是向量化的。不必为每种情况单独运行一段代码,也不要在循环中不断增长对象。我会直接这样写:
# One-hot encode a vector of equal-length amino-acid sequences.
#
# Args:
#   octamer: character vector (or factor) of sequences.
#
# Returns:
#   A numeric matrix with one row per input sequence and 20 * k columns,
#   where k is the number of residues per sequence (160 for octamers).
#   Each row is the k x 20 indicator matrix of that sequence flattened
#   column by column. Letters outside the alphabet (e.g. "J" or "B")
#   produce NA in that residue's slots. Zero-length input yields a
#   0 x 0 matrix.
#
# Errors if the sequences do not all have the same number of residues.
orthocode <- function(octamer) {

  # Predefine the alphabet (no "J" or "B" here btw)
  amino_acids <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L",
                   "K", "M", "F", "P", "S", "T", "W", "Y", "V")

  # Predefine identity matrix: row i is the one-hot code of amino_acids[i]
  m <- diag(length(amino_acids))

  # Create a character vector of residues for each input
  octamer_split <- strsplit(as.character(octamer), "", fixed = TRUE)

  if (length(octamer_split) == 0L) {
    return(matrix(numeric(0), nrow = 0L, ncol = 0L))
  }

  n_residues <- lengths(octamer_split)
  if (length(unique(n_residues)) > 1L) {
    stop("all sequences must have the same number of residues",
         call. = FALSE)
  }
  width <- n_residues[1L] * length(amino_acids)

  # Look up the code rows for each sequence and flatten them; vapply()
  # guarantees a width x n matrix whatever the number of inputs.
  encoded <- vapply(
    octamer_split,
    function(x) as.vector(m[match(x, amino_acids), , drop = FALSE]),
    numeric(width)
  )
  t(encoded)

}

此函数既适用于单个输入,也适用于向量。您可以使用以下代码进行测试:
orthocode(c("ARNDCQEG", "NGJKAEPS", "ABGSWKLA"))

或者仅使用

orthocode(data[, 1])

P.S.

你的字母表向量中没有J和B,因此不确定应如何处理你的例子。在这种情况下,函数会返回NA。