我有一个矩阵“mat”,其中012个编码的SNP作为列,人们作为行。例如:
> mat<-matrix(c("0","1","0","1","2","0","1","1","2"),3,byrow=T)
> rownames(mat)<-c("ID1","ID2","ID3")
> colnames(mat)<-c("rs123","rs333","rs9000")
> mat
rs123 rs333 rs9000
ID1 "0" "1" "0"
ID2 "1" "2" "0"
ID3 "1" "1" "2"
在不同的矩阵“mat2”中,我在两列(即次要和主要等位基因)中具有相应的等位基因,并且SNP为行。
> mat2<-matrix(c("A","T","C","T","T","G"),3,byrow=T)
> rownames(mat2)<-c("rs123","rs333","rs9000")
> colnames(mat2)<-c("Allele_A","Allele_B")
> mat2
Allele_A Allele_B
rs123 "A" "T"
rs333 "C" "T"
rs9000 "T" "G"
现在我想将第一个矩阵中的012编码的SNP重新编码为两列:如果它们的代码为零,它们应该是两个新列中的各自的等位基因A,如果是一个则是A / B和B / B如果它是两个。在我的例子中,我希望得到以下内容:
> mat3<-matrix(c("A","C","T","A","T","T","A","T","T","T","T","T","A","C","G","T","T","G" ),3,byrow=T)
> rownames(mat3)<-c("ID1","ID2","ID3")
> colnames(mat3)<-c("rs123_1","rs333_1","rs9000_1","rs123_2","rs333_2","rs9000_2")
> mat3
rs123_1 rs333_1 rs9000_1 rs123_2 rs333_2 rs9000_2
ID1 "A" "C" "T" "A" "T" "T"
ID2 "A" "T" "T" "T" "T" "T"
ID3 "A" "C" "G" "T" "T" "G"
你可以帮助我实现这个目标吗?提前谢谢!
答案 0 :(得分:4)
library(reshape2)
library(data.table)
mat<-data.table(matrix(c("ID1","0","1","0","ID2","1","2","0","ID3","1","1","2"),3,byrow=T))
setnames(mat, c("ID","rs123","rs333","rs9000"))
mat2<-data.table(matrix(c("rs123","A","T","rs333","C","T","rs9000","T","G"),3,byrow=T))
setnames(mat2, c("rs","Allele_A","Allele_B"))
mat <- data.table(melt(mat, id.vars = 'ID'))
setnames(mat,'variable','rs')
mat <- merge(mat,mat2, by = 'rs', all.x = TRUE)
mat[,a1 := Allele_A]
mat[value == 2,a1 := Allele_B]
mat[,a2 := Allele_B]
mat[value == 0,a2 := Allele_A]
mat1 <- dcast(mat, ID~rs, value.var = 'a1')
mat2 <- dcast(mat, ID~rs, value.var = 'a2')
mat <- merge(mat1,mat2,by = 'ID', suffixes = c('_1','_2'))
输出
> mat
ID rs123_1 rs333_1 rs9000_1 rs123_2 rs333_2 rs9000_2
1 ID1 A C T A T T
2 ID2 A T T T T T
3 ID3 A C G T T G
答案 1 :(得分:2)
继承了另一个data.table解决方案
library(data.table)
DT.mat1 <- as.data.table(mat)
DT.mat2 <- as.data.table(mat2, keep.rownames=TRUE)
## Make sure all characters
DT.mat2[, names(DT.mat2) := lapply(.SD, as.character)]
# set key to the row names
setkey(DT.mat2, rn)
DT.mat1[, setNames(nm=paste(names(.SD), "_", rep(1:2, times=ncol(.SD))),
unlist(lapply(names(.SD), function(x) {
.sdx <- .SD[[x]]
DT.mat2[.(x)][, list(ifelse(.sdx==2, Allele_B, Allele_A),
ifelse(.sdx==0, Allele_B, Allele_A))]
}), recursive=FALSE, use.names=FALSE)
)]
rs123 _ 1 rs333 _ 2 rs9000 _ 1 rs123 _ 2 rs333 _ 1 rs9000 _ 2
1: A T C C T G
2: A A T C T G
3: A A C C G T
您可以使用microbenchmark
(来自microbenchmark软件包)对此进行计时。我不确定你的实际数据有多大,但我猜你的数据会增长,这会很快
答案 2 :(得分:1)
以下是使用stack
,switch
和unstack
的基本R方式:
s.mat <- stack(data.frame(mat))
left.allele.choice <- function(x) switch(x, "0"=1, "1"=1, "2"=2)
right.allele.choice <- function(x) switch(x, "0"=1, "1"=2, "2"=2)
get.allele.choice <- function(allele.choice) {
mapply(function(values, ind) mat2[ind, allele.choice(values)],
values=s.mat$values, ind=s.mat$ind)
}
left.side <- unstack(transform(s.mat,
values=get.allele.choice(left.allele.choice)))
right.side <- unstack(transform(s.mat,
values=get.allele.choice(right.allele.choice)))
result <- cbind(left.side, right.side)
colnames(result) <- make.unique(colnames(result))
# rs123 rs333 rs9000 rs123.1 rs333.1 rs9000.1
# 1 A C T A T T
# 2 A T T T T T
# 3 A C G T T G
答案 3 :(得分:0)
不如Codoremifa提出的那样漂亮,作为一种功能:
allele_count_to_genotype <- function(M_count, M_geno){
# assuming same order of SNPs in both matrices!
# generate empty target matrix with right col and row names
M_target <- matrix(ncol=ncol(M_count)*2, nrow=nrow(M_count))
rownames(M_target) <- rownames(M_count)
colnames(M_target) <- c(paste(colnames(M_count),"1",sep="_"),
paste(colnames(M_count),"2",sep="_"))
# fill the target matrix with counts
M_target[,1:ncol(M_count)] <- M_count
M_target[, (ncol(M_count)+1):ncol(M_target)] <- M_count
# substitute the counts
for(k in 1:ncol(M_count)){
M_target[,k] <- gsub("0", M_geno[k,1], M_target[,k])
M_target[,k] <- gsub("1", M_geno[k,1], M_target[,k])
M_target[,k] <- gsub("2", M_geno[k,2], M_target[,k])
M_target[,(ncol(M_count)+k)] <- gsub("0", M_geno[k,1], M_target[,(ncol(M_count)+k)])
M_target[,(ncol(M_count)+k)] <- gsub("1", M_geno[k,2], M_target[,(ncol(M_count)+k)])
M_target[,(ncol(M_count)+k)] <- gsub("2", M_geno[k,2], M_target[,(ncol(M_count)+k)])
}
return(M_target)
}
效果很好:
> allele_count_to_genotype(M_count=mat, M_geno=mat2)
rs123_1 rs333_1 rs9000_1 rs123_2 rs333_2 rs9000_2
ID1 "A" "C" "T" "A" "T" "T"
ID2 "A" "T" "T" "T" "T" "T"
ID3 "A" "C" "G" "T" "T" "G"