Question

嗨，并且提前抱歉这篇长篇文章。

我来寻求帮助，因为我猜我的代码非常有效，甚至看起来效果不佳。

基本问题是与我们所拥有的程序相关联的问题，它收集液相色谱与质谱联用（LC-MS）的标记。我们有一个程序可以引导机器，记录数据并具有一些基本的分析功能。会发生的是，程序在每个记录的时间点收集几个质量，并且这些时间点的连续创建图表，因此一个质量的强度上升和随后的减少会产生峰值。程序应提取并随后整合峰值，给定操作员指定的参数。可以控制的参数之一是时间窗口和质量窗口，其中信号应该被认为属于相同的峰值而不是不同的峰值。那就是理论。

当检查数据时出现问题：许多信号实际上在指定的时间和质量窗口内。我想编写一个脚本来自动重新组合这些峰值并添加它们的强度。作为一个R写作的新手，我使用了大量的for循环和if语句，嵌套了所有语句，然后按小步骤进行操作，尽可能地解决问题。以下代码在随机样本的子集上工作得很好（750个变量而不是8000个变量）。对于多达500个样本，脚本运行得非常顺利。在750，已经有一个明显的思考时间。对于真实的数据集，它可以在一夜之间甚至一整天工作。

所以我们有几个列有关样本的信息（ID，xenobitic，被程序检测为饱和...），但我感兴趣的是第2列：保留时间，第3列：质量，然后第9列到第9列最后是每个样品的所述时间/质量对的积分值。我采用的方法是在所有保留时间和所有质量之间计算距离，确定质量和时间都落入定义窗口的样本。这个识别步骤需要大量的时间。然后形成落入同一窗口的信号组。我的意思是：如果A和B落入同一个窗口，B和C落入另一个窗口，那么我想要总结A和B和C.这也需要很长时间来计算。

然而，最后，当我用相同的方法（时间和质量落入预定义窗口的距离）验证结果数据时，我仍然得到一些命中。所以某些事情没有按预期发生。

以下是测试集：https://www.dropbox.com/s/d1yxxv1ketkz67g/Fs_BL-LC.txt?dl=0

这是我的代码（其中一些注释适用于我实验室中也将最终使用该脚本的其他人）：

#GET STARTED AND DEFINE VARIABLES
#Modify separator and decimal marker if you exported your file in a different format. Unskip the lines if you removed the MarkerLynx lines too.
metabo<-read.table(file.choose(), sep="\t", dec=".", skip="4", h=T)
winRT<-0.1
winMZ<-0.2
export.name<-"ramdom.export.name"
export.result<-"random.export.result.name"

#IDENTIFY REDUNDANCY IN SIGNALS
RT<-dist(metabo[,2])
MZ<-dist(metabo[,3])
dRT<-which(RT<winRT)
dMZ<-which(MZ<winMZ)
dRTdMZ<-intersect(dRT,dMZ)

index<-matrix(ncol=2)
for(n in dRTdMZ){
index<-rbind(index,which(as.matrix(RT) == RT[n] & lower.tri(as.matrix(RT)) & as.matrix(MZ) == MZ[n] & lower.tri(as.matrix(MZ)), arr.ind=T)) }
index<-index[2:length(index[,2]),]
deltaRT<-abs(metabo[index[,1],2]-metabo[index[,2],2])
deltaMZ<-abs(metabo[index[,1],3]-metabo[index[,2],3])
result<-cbind(metabo[index[,1],2:3],metabo[index[,2],2:3], deltaRT, deltaMZ)
result<-result[order(result[,1]),]
write.table(result, paste(export.result,".txt", collapse=""), sep="\t", dec=".", row.names=F)

#ISOLATE NON-REDUDANT SIGNALS
metabo2<-metabo
for(n in 1:length(result[,2])){
metabo2<-metabo2[!((metabo2[,2]==result[n,1] & metabo2[,3]==result[n,2])|(metabo2[,2]==result[n,3] & metabo2[,3]==result[n,4])),] }

#VERIFICATION (not needed, un-comment just to be sure)
#dRT2<-dist(metabo2[,2])
#dMZ2<-dist(metabo2[,3])
#wdRT2<-which(dRT2<winRT)
#wdMZ2<-which(dMZ2<winMZ)
#dRT2dMZ2<-intersect(wdRT2,wdMZ2)

#ADD SIGNALS WHEN REDUNDANT
#Adds integration values, takes mean of RT and MZ, only highest significance carried over. ID, Incl., Marker, Xeno and Satur. have 1 indication if were identical, or both separated by a slash if were different 

#Group in list of lists elements
temp.result2<-list()
for(n in 1:length(result[,1])){
    temp.result3<-c()
    for(m in 1:length(result[,1])){
        if(identical(result[n,1],result[m,1]) & identical(result[n,2],result[m,2])){
            temp.result3<-rbind(temp.result3, result[n,1:2], result[n,3:4], result[m,1:2], result[n,3:4])
        } else {}
    }
    for(k in 1:length(result[,1])){
        if(identical(result[n,1],result[k,3]) & identical(result[n,2],result[k,4])){
            temp.result3<-rbind(temp.result3, result[n,1:2], result[n,3:4], result[k,1:2], result[k,3:4])
        } else {}
    }
    temp.result2[[n]]<-temp.result3
}
for(n in 1:length(result[,1])){
    temp.result3<-c()
    for(m in 1:1:length(result[,1])){
        if(identical(result[n,3],result[m,3]) & identical(result[n,4],result[m,4])){
            temp.result3<-rbind(temp.result3, result[m,1:2], result[m,3:4], result[n,1:2], result[n,3:4])
        } else {}
    }   
    temp.result2[[length(temp.result2)+1]]<-temp.result3
}
#Regroup sublists which share elements
for(n in 1:length(temp.result2)){
    temp.result2[[n]]<-unique(temp.result2[[n]][order(temp.result2[[n]][,1]),])
}
for(n in 1:length(temp.result2)){ for(m in 1:length(temp.result2)){
    if(length(intersect(temp.result2[[n]][,1], temp.result2[[m]][,1]))!=0 & temp.result2[[n]][1,2]-temp.result2[[m]][1,2]<3*winMZ){
        temp.result2[[n]]<-rbind(temp.result2[[n]],temp.result2[[m]])
    }   else{}
    temp.result2[[n]]<-unique(temp.result2[[n]][order(temp.result2[[n]][,1]),])
}
}
#Identify redundancy in sublists and changes doubles, then copy remaining ones to new list.
for(n in 1:length(temp.result2)){ for(m in 1:length(temp.result2)){
    if(m!=n){ 
        if(length(intersect(temp.result2[[n]][,1], temp.result2[[m]][,1]))==length(temp.result2[[n]][,1])){ 
            temp.result2[[m]]<-cbind("Remove","Remove") 
        } else {}   
    } else{}
}}
temp.result3<-list()
for(n in 1:length(temp.result2)){
    if(is.numeric(temp.result2[[n]][,1])==T){
        temp.result3[[length(temp.result3)+1]]<-temp.result2[[n]]
    } else {}
}

#Add integration values, takes mean of RT and MZ...
metabo3<-data.frame()
for(n in 1:length(temp.result3)){
    temp.ID<-c()
    for(m in 1:length(temp.result3[[n]][,1])){
        temp.ID<-append(temp.ID, as.character(metabo[metabo[,2]==temp.result3[[n]][m,1] & metabo[,3]==temp.result3[[n]][m,2],1]))
    }
    if(length(unique(temp.ID))==1){ new.ID<-temp.ID[1] } else { new.ID<-paste(unique(temp.ID), collapse="/") }
    temp.RT<-c()
    for(m in 1:length(temp.result3[[n]][,1])){
        temp.RT<-append(temp.RT, metabo[metabo[,2]==temp.result3[[n]][m,1] & metabo[,3]==temp.result3[[n]][m,2],2])
    }
    new.RT<-round(mean(temp.RT), digits=4)
    temp.MZ<-c()
    for(m in 1:length(temp.result3[[n]][,1])){
        temp.MZ<-append(temp.MZ, metabo[metabo[,2]==temp.result3[[n]][m,1] & metabo[,3]==temp.result3[[n]][m,2],3])
    }
    new.MZ<-round(mean(temp.MZ), digits=4)
    temp.sign<-c()
    for(m in 1:length(temp.result3[[n]][,1])){
        temp.sign<-append(temp.sign, metabo[metabo[,2]==temp.result3[[n]][m,1] & metabo[,3]==temp.result3[[n]][m,2],4])
    }
    new.sign<-max(temp.sign)
    temp.incl<-c()
    for(m in 1:length(temp.result3[[n]][,1])){
        temp.incl<-append(temp.incl, as.character(metabo[metabo[,2]==temp.result3[[n]][m,1] & metabo[,3]==temp.result3[[n]][m,2],5]))
    }
    if(length(unique(temp.incl))==1){   new.incl<-temp.incl[1] } else { new.incl<-paste(unique(temp.incl), collapse="/") }
    temp.mark<-c()
    for(m in 1:length(temp.result3[[n]][,1])){
        temp.mark<-append(temp.mark, as.character(metabo[metabo[,2]==temp.result3[[n]][m,1] & metabo[,3]==temp.result3[[n]][m,2],6]))
    }
    if(length(unique(temp.mark))==1){   new.mark<-temp.mark[1] } else { new.mark<-paste(unique(temp.mark), collapse="/") }
    temp.xeno<-c()
    for(m in 1:length(temp.result3[[n]][,1])){
        temp.xeno<-append(temp.xeno, as.character(metabo[metabo[,2]==temp.result3[[n]][m,1] & metabo[,3]==temp.result3[[n]][m,2],7]))
    }
    if(length(unique(temp.xeno))==1){   new.xeno<-temp.xeno[1] } else { new.xeno<-paste(unique(temp.xeno), collapse="/") }
    temp.sat<-c()
    for(m in 1:length(temp.result3[[n]][,1])){
        temp.sat<-append(temp.sat, as.character(metabo[metabo[,2]==temp.result3[[n]][m,1] & metabo[,3]==temp.result3[[n]][m,2],8]))
    }
    if(length(unique(temp.sat))==1){    new.sat<-temp.sat[1] } else { new.sat<-paste(unique(temp.sat), collapse="/") }
        temp.integ<-c()
    for(m in 1:length(temp.result3[[n]][,1])){
        temp.integ<-rbind(temp.integ, metabo[metabo[,2]==temp.result3[[n]][m,1] & metabo[,3]==temp.result3[[n]][m,2],9:length(metabo)])
    }
    new.integ<- colSums(temp.integ)
    metabo3<-rbind(metabo3,cbind(new.ID, new.RT, new.MZ, new.sign, new.incl, new.mark, new.xeno, new.sat, rbind(new.integ[1:length(new.integ)])))
}

#VERIFICATION (not needed, un-comment just to be sure)
#dRT3<-dist(metabo3[,2])
#dMZ3<-dist(metabo3[,3])
#wdRT3<-which(dRT3<winRT)
#wdMZ3<-which(dMZ3<winMZ)
#dRT3dMZ3<-intersect(wdRT3,wdMZ3)

#REGROUP ALL SETS
colnames(metabo2)<-colnames(metabo)
colnames(metabo3)<-colnames(metabo)
metabo.final<-rbind(metabo2, metabo3)
metabo.final<-metabo.final[order(metabo.final[,2]),]

#VERIFICATION (not needed, un-comment just to be sure)
#dRTf<-dist(metabo.final[,2])
#dMZf<-dist(metabo.final[,3])
#wdRTf<-which(dRTf<winRT)
#wdMZf<-which(dMZf<winMZ)
#dRTfdMZf<-intersect(wdRTf,wdMZf)

#EXPORT
write.table(metabo.final, paste(export.name,".txt", collapse=""), sep="\t", dec=".", row.names=F)

请在你的意见中表示友好：尽管可能效率低下，但我仍然在这方面付出了很多努力，只希望改进。

提前多多感谢！

R：如果2个变量落在定义的窗口中，则求和值

0 个答案: