
时间:2018-12-22 22:49:30

标签: r dataframe bioinformatics


我的起始数据框如下所示: 每行都是遗传标记。前两列提供了该标记的位置信息,随后几列提供了该特定标记处的个体的DNA核苷酸信息。



因此,在此数据框中共有5行,即5个遗传标记,以及3个个体。(个体1的两个核苷酸位于V1和V2两列,个体2的位于V3和V4两列,依此类推。)

group pos V1 V2 V3 V4 V5 V6 
1     10  A  A  G  G  T  T
2     11  C  C  G  G  A  A
3     12  T  T  T  A  C  G
4     13  0  0  0  A  C  G
5     14  G  T  0  0  C  A    



A A C C T T 0 0 G T 
G G G G T A 0 A 0 0 
T T A A C G C G C A 


# Build one row per individual by interleaving that individual's two allele
# columns marker by marker. Assumes `data` is a data frame whose first two
# columns hold position info, so allele pairs start at column 3 and advance
# two columns at a time.
pair_starts <- seq(3, ncol(data), 2)
oi3 <- lapply(pair_starts, function(j) {
  # rbind() stacks the two allele columns as the rows of a 2 x n matrix;
  # as.vector() then reads it column by column, which yields
  # allele 1, allele 2 for marker 1, then marker 2, and so on.
  as.vector(rbind(as.character(data[[j]]), as.character(data[[j + 1]])))
})
# Stack the per-individual vectors as rows and convert to a data frame.
df <- data.frame(do.call(rbind, oi3))


2 个答案:

答案 0 :(得分:0)



library(data.table)
# Example marker table parsed from inline text: `group` and `pos` locate each
# marker, V1..V6 hold the allele calls.
dt <- fread(
  "group pos V1 V2 V3 V4 V5 V6 
1     10  A  A  G  G  T  T
2     11  C  C  G  G  A  A
3     12  T  T  T  A  C  G
4     13  0  0  0  A  C  G
5     14  G  T  0  0  C  A",
  stringsAsFactors = FALSE,
  header = TRUE
)


library(tidyverse)
# Concatenate each row's allele columns (everything except columns 1 and 2)
# into a single string per marker, e.g. "AAGGTT".
l1 <- pmap(dt[, -c(1, 2)], paste0)
# Split every string into two-letter chunks, one chunk per individual.
l2 <- lapply(l1, strsplit, "(?<=.{2})", perl = TRUE)
data <- unlist(l2)
# One row per individual: derive the count from the number of allele columns
# (two per individual) instead of hard-coding 3.
n_individuals <- (ncol(dt) - 2) / 2
# Paste each individual's chunks across markers into one string.
matrix(data, nrow = n_individuals) %>% apply(., 1, paste, collapse = "")



答案 1 :(得分:0)


# Drop the first two columns (group, pos) and keep the remaining allele
# columns as a matrix. The original line was missing its closing parenthesis.
m <- as.matrix(DF[-(1:2)])
nr <- nrow(m) # 5
nc <- ncol(m) # 6

# View m as a 3-d array [marker, allele, individual], reverse the dimension
# order to [individual, allele, marker], then flatten into a matrix with one
# row per individual (alleles interleaved marker by marker).
matrix(aperm(array(m, c(nr, 2, nc / 2)), c(3, 2, 1)), nc / 2)


     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "A"  "A"  "C"  "C"  "T"  "T"  "0"  "0"  "G"  "T"  
[2,] "G"  "G"  "G"  "G"  "T"  "A"  "0"  "A"  "0"  "0"  
[3,] "T"  "T"  "A"  "A"  "C"  "G"  "C"  "G"  "C"  "A" 

2)上述方法的一种变体:先转置m,将其重塑为三维数组,再交换前两个维度,最后重新整形为矩阵。

# Variant: transpose first so the array is laid out as
# [allele, individual, marker], swap the first two dimensions, then flatten
# into a matrix with nc / 2 rows.
matrix(
  aperm(array(t(m), dim = c(2, nc / 2, nr)), perm = c(2, 1, 3)),
  nrow = nc / 2
)

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "A"  "A"  "C"  "C"  "T"  "T"  "0"  "0"  "G"  "T"  
[2,] "G"  "G"  "G"  "G"  "T"  "A"  "0"  "A"  "0"  "0"  
[3,] "T"  "T"  "A"  "A"  "C"  "G"  "C"  "G"  "C"  "A"  


# Reproducible input: the example marker table as inline text. This must be
# run before any snippet that reads `DF`.
Lines <- "
group pos V1 V2 V3 V4 V5 V6 
1     10  A  A  G  G  T  T
2     11  C  C  G  G  A  A
3     12  T  T  T  A  C  G
4     13  0  0  0  A  C  G
5     14  G  T  0  0  C  A"
# as.is = TRUE keeps the allele columns as character vectors, not factors.
DF <- read.table(
  header = TRUE,
  as.is = TRUE,
  text = Lines
)