我有以下数据(仅展示一小部分),文件名为 "short2_pre_snp_tumor.txt":
rs987435 C G 1 1 1 0 2
rs345783 C G 0 0 1 0 0
rs955894 G T 1 1 2 2 1
rs6088791 A G 1 2 0 0 1
rs11180435 C T 1 0 1 1 1
rs17571465 A T 1 2 2 2 2
rs17011450 C T 2 2 2 2 2
rs6919430 A C 2 1 2 2 2
rs2342723 C T 0 2 0 0 0
rs11992567 C T 2 2 2 2 2
我需要使用 Python 生成 PED 和 MAP 文件,因为在处理大型数据集时 R 非常慢。
我在R中有以下代码:
# Convert a SNP genotype table into a PLINK .ped file.
#
# Input layout (one row per SNP, whitespace separated, no header):
#   column 1     SNP id
#   column 2, 3  the two alleles (A and B)
#   column 4..   one genotype code per individual: 0 = A A, 1 = A B, 2 = B B
# Output: one row per individual, 6 leading PED columns followed by two
# allele columns per SNP.
tm <- proc.time()

# Force the id and allele columns to character. Without this, read.table()
# type-converts an allele column that contains only "T" to logical TRUE.
d <- read.table(
  "short2_pre_snp_tumor.txt",
  colClasses = c(V1 = "character", V2 = "character", V3 = "character")
)
stopifnot(ncol(d) > 3)
n <- nrow(d)        # number of SNPs (rows of the input)
nrs <- ncol(d) - 3  # number of individuals (genotype columns)

# Genotype codes as an n x nrs matrix (SNPs in rows, individuals in columns).
geno <- as.matrix(d[, -(1:3), drop = FALSE])

# Lookup tables: column k holds the allele to emit for genotype code k - 1.
first_allele <- cbind(d[[2]], d[[2]], d[[3]])
second_allele <- cbind(d[[2]], d[[3]], d[[3]])

# One (SNP row, code column) index pair per genotype, in column-major order.
# Codes other than 0/1/2 become NA and therefore yield NA alleles.
lookup <- cbind(as.vector(row(geno)), match(geno, 0:2))

# Interleave both alleles per genotype, then lay the result out with one row
# per individual: columns 2i-1 and 2i hold the two alleles of SNP i.
dd <- matrix(
  rbind(first_allele[lookup], second_allele[lookup]),
  nrow = nrs,
  ncol = 2 * n,
  byrow = TRUE
)

# Leading PED columns; SEX is a random placeholder, as in the original script.
ped6front <- data.frame(
  FID = seq_len(nrs),
  IID = seq_len(nrs),
  PID = 0,
  MID = 0,
  SEX = sample(1:2, nrs, replace = TRUE),
  PHENOTYPE = 2
)
BRCA_tumorfromR.ped <- cbind(ped6front, dd)

# row.names = FALSE: the default would prepend a row-name column and shift
# every PED field by one. Missing alleles are written as "0", which is
# PLINK's default missing-genotype code.
write.table(
  BRCA_tumorfromR.ped,
  "BRCA_tumor.ped",
  append = FALSE,
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE,
  na = "0"
)

# Elapsed time since the start of the script.
print(proc.time() - tm)
答案 0 :(得分:2)
以下是使用 R 的解决方案:
# Build PLINK PED and MAP files from a SNP table with one row per SNP:
# SNP id, allele A, allele B, then one 0/1/2 genotype code per individual.

# raw data
# The id and allele columns are forced to character; otherwise an allele
# column containing only "T" would be type-converted to logical TRUE.
myRaw <- read.table(text = "
rs987435 C G 1 1 1 0 2
rs345783 C G 0 0 1 0 0
rs955894 G T 1 1 2 2 1
rs6088791 A G 1 2 0 0 1
rs11180435 C T 1 0 1 1 1
rs17571465 A T 1 2 2 2 2
rs17011450 C T 2 2 2 2 2
rs6919430 A C 2 1 2 2 2
rs2342723 C T 0 2 0 0 0
rs11992567 C T 2 2 2 2 2",
  colClasses = c(V1 = "character", V2 = "character", V3 = "character"))

# At least one genotype column is required.
stopifnot(ncol(myRaw) > 3)
nIndividuals <- ncol(myRaw) - 3
nSNPs <- nrow(myRaw)

# make map: chromosome and base-pair position are placeholders
MAP <- data.frame(
  CHR = 1,
  SNP = myRaw$V1,
  CM = 0,
  BP = seq_len(nSNPs))

# first 6 columns of PED: sex and phenotype are placeholders
PED6 <- data.frame(
  FID = seq_len(nIndividuals),
  IID = seq_len(nIndividuals),
  FatherID = 0,
  MotherID = 0,
  Sex = 1,
  Phenotype = 1)

# helper table for matching alleles: one row per SNP, and column k holds the
# genotype string for code k - 1 (0 = "A A", 1 = "A B", 2 = "B B")
myAlleles <- data.frame(
  AA = paste(myRaw$V2, myRaw$V2),
  AB = paste(myRaw$V2, myRaw$V3),
  BB = paste(myRaw$V3, myRaw$V3),
  stringsAsFactors = FALSE)

# shift codes 0/1/2 to column indices 1/2/3 of myAlleles;
# drop = FALSE keeps a data frame even with a single individual
PEDsnps <- myRaw[, -(1:3), drop = FALSE] + 1

# convert all genotypes in one vectorized lookup:
# codes is individuals x SNPs, and each element is paired with its SNP index
codes <- t(as.matrix(PEDsnps))
PEDsnpsAB <- matrix(
  as.matrix(myAlleles)[cbind(as.vector(col(codes)), as.vector(codes))],
  nrow = nIndividuals,
  ncol = nSNPs)

# column bind first 6 cols with genotypes
PED <- cbind(PED6, PEDsnpsAB)

# output PED and MAP
write.table(PED, "gwas.ped", quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t")
write.table(MAP, "gwas.map", quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t")
# test plink
# plink --file gwas
# PLINK v1.90b3c 64-bit (2 Feb 2015) https://www.cog-genomics.org/plink2
# (C) 2005-2015 Shaun Purcell, Christopher Chang GNU General Public License v3
# Logging to plink.log.
# 258273 MB RAM detected; reserving 129136 MB for main workspace.
# .ped scan complete (for binary autoconversion).
# Performing single-pass .bed write (10 variants, 5 people).
# --file: plink.bed + plink.bim + plink.fam written.