
时间:2013-11-21 17:21:34

标签: python random bioinformatics


Chromosome    Start    End  Length    Number  
chr1     100    400    300    6...


Chromosome    Site    
chr1    105
chr1    110...

我想以第一个数据集为模板,从第二个数据集中随机选取区域。也就是说,对于第一个文件中的每个区域,我希望从第二个数据集中得到一个长度(Length)和位点数量(Number)都相同、但位置随机的区域。


Chromosome    Start    End  Length    Number  
chr5     350    650    300    6...


# Asker's sketch: collect the site lines of FileB, then draw one random
# starting line for every region listed in FileA.
List = []
NewList = []
LineCount = 0

# Keep every line of FileB except the first (header) line so that
# random.sample() below has a population to draw from.
for Line in FileB:
    if LineCount > 0:
        List.append(Line)
    LineCount += 1

for Line in FileA:
    # The first five tab-separated fields of a FileA line.
    Chr, Start, End, Len, Entries = Line.strip("\n").split("\t")[:5]
    # random.sample() returns a one-element list holding the chosen line.
    RandomStart = random.sample(List, 1)
    ## here I need to find a way to keep adding sequential lines to a NewList till the last site minus the first site is near the Len
    ## then I need to convert this new list into the format Chr, Start, End, Length, Number and write out and then clear NewList

2 个答案:

答案 0 :(得分:1)


# Pair every region of file A with every region of file B that has the same
# length and the same number field, and write the pairs to foundMatches.tsv.

# Read both tab-separated tables; ``with`` closes the handles afterwards.
with open(pathToFileA) as handleA:
    fileA = handleA.read()
with open(pathToFileB) as handleB:
    fileB = handleB.read()

# splitlines() does not yield the empty trailing record that split("\n")
# produces for a file ending in a newline.
splitA = fileA.splitlines()
splitB = fileB.splitlines()

# Split the rows of B once instead of once per row of A; rows with fewer than
# five fields (blank or truncated lines) cannot be compared and are skipped.
rowsB = []
for genomicRegionB in splitB:
    splitRegionsB = genomicRegionB.split("\t")
    if len(splitRegionsB) >= 5:
        rowsB.append(splitRegionsB[:5])

with open("foundMatches.tsv", "w") as out:
    for genomicRegion in splitA:
        splitRegionsA = genomicRegion.split("\t")
        if len(splitRegionsA) < 5:
            continue  # blank or truncated line
        chromosomeA, startA, endA, lengthA, numberA = splitRegionsA[:5]

        for chromosomeB, startB, endB, lengthB, numberB in rowsB:
            # Fields are compared as text, exactly as they appear in the files.
            # No header handling is done here, so header rows are compared
            # like any other row.
            if lengthA == lengthB and numberA == numberB:
                out.write("\t".join((
                    chromosomeA, startA, endA, lengthA, numberA,
                    chromosomeB, startB, endB, lengthB, numberB,
                )) + "\n")

然后你就可以从输出文件 foundMatches.tsv 中随机抽取样本。(如果你的数据集很大,最好换用更高效的做法。)

答案 1 :(得分:0)


import random

def get_regions(i, Chr, Start, End, Len):
    """Grow a window of consecutive ``Dict`` rows from row ``i`` and write it if it fits.

    Starting at row ``i`` the window is extended one row at a time while the
    candidate end stays within ``Len + 15`` of ``Start``.  If the resulting
    span lies within 15 of ``Len`` a line is written to the module-level
    ``OutFile``; otherwise a new random start is requested via ``get_random``.

    Args:
        i: Key into the module-level ``Dict`` of the first row of the window.
        Chr: Chromosome label copied verbatim into the output line.
        Start: Integer start coordinate of the window.
        End: Integer provisional end coordinate used for the first loop test.
        Len: Target span; anything accepted by ``int()``.

    Relies on the module-level names ``Dict``, ``OutFile``, ``Keys`` and
    ``get_random``.
    """
    target = int(Len)  # converted once instead of on every comparison
    lower, upper = target - 15, target + 15

    n = EndN = 0
    # ``(i + 1) in Dict`` stops the scan at the last row instead of raising
    # KeyError when the window reaches the end of the table.
    while 0 < (End - Start) <= upper and (i + 1) in Dict:
        End = int(Dict[i + 1].split("\t")[2])   # end of the next row: loop test
        EndN = int(Dict[i].split("\t")[2])      # end of the last row accepted
        i += 1
        n += 1

    if lower <= (EndN - Start) <= upper:
        # NOTE(review): columns are written as <count> then <span>; the header
        # shown in the question lists Length before Number — confirm which
        # order the downstream consumer expects.
        OutFile.write("\t".join(
            [Chr, str(Start), str(EndN), str(n), str(int(EndN) - int(Start))]
        ) + "\n")
    else:
        # The window did not fit: try again from another random start.  The
        # original drew again after a *successful* write and gave up silently
        # on failure, which looks like a lost ``else:``.
        # NOTE(review): retries recurse through get_random(), so a very
        # unlucky run can reach Python's recursion limit.
        get_random(Keys)

def get_random(Keys):
    """Pick a random row of ``Dict`` and hand it to ``get_regions``.

    Args:
        Keys: Iterable of keys of the module-level ``Dict`` to choose from.

    Returns:
        Tuple ``(Chr, Start, End, i)``: the first field of the chosen row, its
        second field as ``int``, the third field (as ``int``) of the row ten
        positions further on, and the chosen key.

    Relies on the module-level names ``Dict``, ``Len`` and ``get_regions``.
    """
    # random.sample() needs a real sequence; dict views are rejected on
    # Python 3.11+.  Lists and tuples are used as-is to avoid copying.
    population = Keys if isinstance(Keys, (list, tuple)) else list(Keys)
    i = random.sample(population, 1)[0]

    first_fields = Dict[i].split('\t')  # split the chosen row only once
    Chr = first_fields[0]
    Start = int(first_fields[1])

    # The provisional end comes from the row ten positions ahead; near the end
    # of the table fall back to the last row instead of raising KeyError.
    ahead = i + 10
    if ahead not in Dict:
        ahead = max(Dict)
    End = int(Dict[ahead].split('\t')[2])

    get_regions(i, Chr, Start, End, Len)
    return Chr, Start, End, i

# Driver: index the rows of FileB, then request one random region for every
# data row of FileA.

# Map zero-based row number -> raw line for every line of FileB after the
# first (header) line.
Dict = {}
with open(FileB, 'r') as InFile:
    for LineCount, Line in enumerate(InFile):
        if LineCount > 0:
            Dict[LineCount - 1] = Line

# Materialise the keys once: random.sample() requires a sequence (a dict view
# is rejected on Python 3.11+), and the key set never changes afterwards.
Keys = list(Dict)

# ``OutFile`` starts out as the output path and is rebound to the open handle;
# get_regions() writes through this module-level name.
OutFile = open(OutFile, 'w')
try:
    with open(FileA, "r") as DiffFile:
        for LineCount, Line in enumerate(DiffFile):
            if LineCount == 0:
                # The first line is the column header, not a region.  The
                # original processed *only* this line and ignored the data.
                Header = Line
                continue
            if not Line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            # NOTE(review): fields 3 and 4 are read as (Entries, Len); the
            # header shown in the question lists Length before Number —
            # confirm the column order of the real input file.
            Entries, Len = Line.strip("\n").split("\t")[3:5]
            Chr, Start, End, i = get_random(Keys)
finally:
    # Make sure everything written by get_regions() reaches the disk.
    OutFile.close()