Question

我有这样的数据集：

   FBti0018875  2031    2045    -   TTCCAGAAACTGTTG hsf 62  0.9763443383736672
    FBti0018875 2038    2052    +   TTCTGGAATGGCAAC hsf 61  0.96581136371138
    FBti0018925 2628    2642    -   GACTGGAACTTTTCA hsf 60  0.9532992561656318
    FBti0018925 2828    2842    +   AGCTGGAAGCTTCTT hsf 63  0.9657036377575696
    FBti0018949 184 198 -   TTCGAGAAACTTAAC hsf 61  0.965931072979605
    FBti0018986 2036    2050    +   TTCTAGCATATTCTT hsf 63  0.9943559469645551
    FBti0018993 1207    1221    -   TTCTAGCATATTCTT hsf 63  0.9943559469645575
    FBti0018996 2039    2053    +   TTCTAGCATATTCTT hsf 63  0.9943559469645575
    FBti0019092 2985    2999    -   TTCTAGCATATTCTT hsf 63  0.9943559469645409
    FBti0019118 1257    1271    +   TTCCAGAATCTTGGA hsf 60  0.9523907773798134

第一列是标识符，第二列和第三列是坐标。对于每个坐标范围，我只想保留一行。也就是说，如果同一标识符的多行坐标存在重叠，我只想保留其中最好的一行（优劣由最后一列决定，数值越高越好）。

例如，对于标识符FBti0018875，我会保留第一个，因为a）与第二行重叠，b）其最后一列值更高（0.97> 0.96）。

如果第一行和第二行之间没有重叠，我会两行都保留。有时同一个标识符会有5行或6行，因此问题并不像把当前行与前一行进行比较那么简单。

到目前为止，我写了下面这段代码，但它无法正常工作。

def test(lista, listb): # Compare two lists of (start, end) coordinates
    """Scan two interval lists in step and report the first overlapping pair.

    Each list is walked with its own cursor. ``check`` decides which
    interval lies entirely before the other, and that side's cursor is
    advanced. The scan stops at the first pair for which ``check`` returns 0.

    Returns:
        ``(True, a, lista[a], b, listb[b])`` for the first overlapping pair,
        where ``a`` and ``b`` are the positions in each list; the bool
        ``False`` when either list is exhausted without an overlap.
        Callers must therefore compare against the bool ``False``, not a
        string.

    NOTE(review): the single forward pass presumably relies on both lists
    being sorted by start coordinate -- confirm against callers.
    """
    a = 0
    b = 0
    found = False
    while a<len(lista) and b<len(listb):
        result = check( lista[a] , listb[b] )
        # lista[a] ends before listb[b] starts: move on in lista.
        if result < 0:
            a += 1
            continue
        # listb[b] ends before lista[a] starts: move on in listb.
        if result > 0:
            b += 1
            continue
        # we found overlapping intervals
        found = True
        return (found, a, lista[a], b, listb[b] )
    # One list ran out without any overlap; found is still False here.
    return found

def check( (astart, aend) , (bstart, bend) ):
    """Order two (start, end) intervals relative to each other.

    Returns -1 when the first interval ends before the second starts,
    1 when the second ends before the first starts, and 0 otherwise
    (the intervals overlap; a shared endpoint counts as overlap because
    the comparisons are strict).

    NOTE(review): tuple parameters in a ``def`` signature are Python 2-only
    syntax (removed by PEP 3113); this line is a SyntaxError on Python 3.
    NOTE(review): the values are compared as given. Callers in this file
    pass fields produced by ``split()``, i.e. strings, so the comparison is
    lexicographic rather than numeric.
    """
    if aend < bstart:
        return -1
    if bend < astart:
        return 1
    return 0

# Input: tab-separated hits. Output: file intended for the de-duplicated hits.
# NOTE(review): `refined` is opened but nothing in this block writes to it or
# closes it; the print statement below goes to stdout.
refine = open("tffm/tffm_te_hits95.txt", "r")
refined = open("tffm/tffm_te_unique_hits95.txt", "w")
# Rows buffered so far for the identifier (column 0) currently being read.
current_TE=[]
for hit in refine:
    info=hit.rstrip().split('\t')
    # Empty buffer, or same identifier as the buffered rows: keep collecting.
    if len(current_TE)==0 or info[0]==current_TE[0][0]:
        current_TE.append(info)
    # Identifier changed: decide which buffered rows to keep.
    # NOTE(review): `info`, the row that triggered this branch, is not added
    # to any buffer in this branch.
    elif info[0]!=current_TE[0][0]:
        to_keep=[]
        i=0
        if len(current_TE)==1:
            to_keep.append(0)
        else:
            # Each row is compared only with the row directly after it.
            for i in range(len(current_TE)-1):
                # Identical start/end: the next row is recorded only when its
                # column 7 compares higher; nothing is recorded otherwise.
                # NOTE(review): the fields come from split(), so column 7 is
                # compared as a string here.
                if [current_TE[i][1], current_TE[i][2]] == [current_TE[i+1][1], current_TE[i+1][2]]:
                    if current_TE[i][7]<current_TE[i+1][7]:
                        to_keep.append(i+1)
                # NOTE(review): test() returns the bool False or a tuple, never
                # the string 'False', so this comparison is always true and
                # non-overlapping neighbours also take this branch.
                elif test([(current_TE[i][1], current_TE[i][2])], [(current_TE[i+1][1], current_TE[i+1][2])])!='False':
                    if current_TE[i][7]<current_TE[i+1][7]:
                        to_keep.append(i+1)
                        # Drop the lower-scoring row if it was recorded earlier.
                        try:
                            to_keep.remove(i)
                        except:
                            pass
                    else:
                        to_keep.append(i)
            # for/else: runs once after the loop finishes (the loop has no break).
            else:
                to_keep.append(i)
                # NOTE(review): after the loop i is at most len(current_TE)-2,
                # so this condition cannot hold.
                if i==len(current_TE)-1:
                    to_keep.append(i+1)
    # NOTE(review): this loop sits at the level of the outer `for` body, so it
    # runs for every input line -- including lines handled by the first branch,
    # where to_keep has not been (re)assigned by this iteration.
    for item in set(to_keep):
        print current_TE[item]
        # NOTE(review): the buffer is reset inside the loop, so any further
        # index taken from to_keep is looked up in an empty list.
        current_TE=[]

这种情况下的预期结果是（只丢失一个FBti0018875）

FBti0018875  2031    2045    -   TTCCAGAAACTGTTG hsf 62  0.9763443383736672
FBti0018925 2628    2642    -   GACTGGAACTTTTCA hsf 60  0.9532992561656318
FBti0018925 2828    2842    +   AGCTGGAAGCTTCTT hsf 63  0.9657036377575696
FBti0018949 184 198 -   TTCGAGAAACTTAAC hsf 61  0.965931072979605
FBti0018986 2036    2050    +   TTCTAGCATATTCTT hsf 63  0.9943559469645551
FBti0018993 1207    1221    -   TTCTAGCATATTCTT hsf 63  0.9943559469645575
FBti0018996 2039    2053    +   TTCTAGCATATTCTT hsf 63  0.9943559469645575
FBti0019092 2985    2999    -   TTCTAGCATATTCTT hsf 63  0.9943559469645409
FBti0019118 1257    1271    +   TTCCAGAATCTTGGA hsf 60  0.9523907773798134

我已经尝试（用上面的代码）把标识符相同的多行收集到一个列表中，然后在列表中查找坐标重叠的行；如果有重叠，就根据最后一列从中选出一行。重叠检测本身是成功的，但在代码的某些版本中我只能得到部分行，或者得到如下错误：

Traceback (most recent call last):
  File "<stdin>", line 29, in <module>
IndexError: list index out of range

Answer 1

最后我解决了。有一个愚蠢的错误：我把返回值与字符串 'False' 比较，而不是与布尔值 False 比较。

以下是解决方案：

def test(lista, listb):
    """Find the first overlapping pair between two lists of intervals.

    Both lists are walked with one cursor each. ``check`` reports which of
    the two current intervals lies entirely before the other, and the
    cursor on that side moves forward.

    Returns:
        ``(True, pos_a, lista[pos_a], pos_b, listb[pos_b])`` for the first
        pair that ``check`` reports as overlapping, or the bool ``False``
        when either list runs out first.
    """
    pos_a, pos_b = 0, 0
    while pos_a < len(lista) and pos_b < len(listb):
        left, right = lista[pos_a], listb[pos_b]
        order = check(left, right)
        if order < 0:
            # `left` ends before `right` begins
            pos_a += 1
        elif order > 0:
            # `right` ends before `left` begins
            pos_b += 1
        else:
            # Neither interval is strictly before the other: overlap
            return (True, pos_a, left, pos_b, right)
    return False

def check(a, b):
    """Order two closed ``(start, end)`` intervals relative to each other.

    Args:
        a: First interval as a two-item sequence ``(start, end)``.
        b: Second interval as a two-item sequence ``(start, end)``.
            Coordinates given as strings (e.g. fields taken straight from
            ``split()``) are converted with ``int()`` so they compare
            numerically; other values are compared as given.

    Returns:
        -1 when ``a`` ends before ``b`` starts, 1 when ``b`` ends before
        ``a`` starts, and 0 when the intervals overlap. A shared endpoint
        counts as overlap.

    Raises:
        ValueError: If a string coordinate is not a valid integer.
    """
    # Unpack inside the body: tuple parameters in the signature are
    # Python 2-only syntax (PEP 3113). Calls stay positional as before.
    astart, aend = [int(v) if isinstance(v, str) else v for v in a]
    bstart, bend = [int(v) if isinstance(v, str) else v for v in b]
    if aend < bstart:
        return -1
    if bend < astart:
        return 1
    return 0

def _spans_overlap(first, second):
    """Return True when two closed (start, end) intervals share a position."""
    return not (first[1] < second[0] or second[1] < first[0])


def get_unique_sre(current_TE):
    """Keep only the best-scoring row out of each set of overlapping hits.

    Rows are visited from the highest score to the lowest, and a row is
    kept only when its interval does not overlap any row already kept.
    This handles groups of any size, not just adjacent pairs: a row is
    never discarded because of a neighbour that was itself discarded.

    Args:
        current_TE: Rows for one identifier. Each row is a sequence where
            index 1 and 2 hold the start and end coordinates and index 7
            holds the score. Values may be numbers or their string form;
            they are parsed with ``int()`` / ``float()`` so that comparisons
            are numeric rather than lexicographic.

    Returns:
        A new list with the surviving rows, in their original order.
        Among overlapping rows with equal scores, the earliest one wins.

    Raises:
        ValueError: If a coordinate or score cannot be parsed as a number.
        IndexError: If a row has fewer than eight fields.
    """
    spans = [(int(row[1]), int(row[2])) for row in current_TE]
    scores = [float(row[7]) for row in current_TE]

    # Best score first. sorted() is stable, so ties stay in input order and
    # the earlier row is preferred.
    ranked = sorted(range(len(current_TE)), key=lambda idx: -scores[idx])

    kept = []
    for idx in ranked:
        if not any(_spans_overlap(spans[idx], spans[other]) for other in kept):
            kept.append(idx)

    # Restore input order for the caller.
    return [current_TE[idx] for idx in sorted(kept)]


from itertools import groupby

# Read the tab-separated hits, de-duplicate overlapping hits within each run
# of rows sharing an identifier (column 0), and write the survivors as
# tab-separated lines. Both files are closed by the `with` statement even if
# a row fails to parse.
with open("tffm/tffm_te_hits95.txt", "r") as refine, \
        open("tffm/tffm_te_unique_hits95.txt", "w") as refined:
    # Skip blank lines; strip only the line terminator so that empty
    # trailing fields are not lost.
    rows = (hit.rstrip("\r\n").split("\t") for hit in refine if hit.strip())
    # groupby() batches consecutive rows with the same identifier. Every row
    # lands in exactly one batch and the last batch is processed like any
    # other, so no row is dropped at a group boundary or at end of file.
    for _, current_TE in groupby(rows, key=lambda info: info[0]):
        for item in get_unique_sre(list(current_TE)):
            refined.write("\t".join(item) + "\n")

坐标重叠时得到最佳线

1 个答案: