我有这样的数据集:
FBti0018875 2031 2045 - TTCCAGAAACTGTTG hsf 62 0.9763443383736672
FBti0018875 2038 2052 + TTCTGGAATGGCAAC hsf 61 0.96581136371138
FBti0018925 2628 2642 - GACTGGAACTTTTCA hsf 60 0.9532992561656318
FBti0018925 2828 2842 + AGCTGGAAGCTTCTT hsf 63 0.9657036377575696
FBti0018949 184 198 - TTCGAGAAACTTAAC hsf 61 0.965931072979605
FBti0018986 2036 2050 + TTCTAGCATATTCTT hsf 63 0.9943559469645551
FBti0018993 1207 1221 - TTCTAGCATATTCTT hsf 63 0.9943559469645575
FBti0018996 2039 2053 + TTCTAGCATATTCTT hsf 63 0.9943559469645575
FBti0019092 2985 2999 - TTCTAGCATATTCTT hsf 63 0.9943559469645409
FBti0019118 1257 1271 + TTCCAGAATCTTGGA hsf 60 0.9523907773798134
第一列是标识符,第二列和第三列是坐标。对于每个坐标范围,我只想保留一行。也就是说,如果存在重叠,我想保留该标识符下最好的一行(根据最后一列判断,值越高越好)。
例如,对于标识符FBti0018875,我会保留第一个,因为a)与第二行重叠,b)其最后一列值更高(0.97> 0.96)。
如果第一行和第二行之间没有重叠,我会保留两者。有时同一个标识符会有5行或6行,因此不能简单地只把当前行与前一行进行比较。
到目前为止,我写了下面这段代码,但它不能正常工作。
def test(lista, listb):  # Compare lists of coordinates
    """Find the first overlapping pair between two lists of intervals.

    Both lists are walked with one cursor each; ``check`` decides which
    cursor advances. Returns ``(True, index_a, interval_a, index_b,
    interval_b)`` for the first overlapping pair, or ``False`` when one of
    the lists is exhausted without an overlap.
    """
    pos_a, pos_b = 0, 0
    while pos_a < len(lista) and pos_b < len(listb):
        order = check(lista[pos_a], listb[pos_b])
        if order < 0:
            # Interval from lista lies entirely before the one from listb.
            pos_a += 1
        elif order > 0:
            # Interval from listb lies entirely before the one from lista.
            pos_b += 1
        else:
            # Neither is strictly before the other: they overlap.
            return (True, pos_a, lista[pos_a], pos_b, listb[pos_b])
    return False
def check(a_interval, b_interval):
    """Compare two closed intervals given as ``(start, end)`` pairs.

    Returns -1 when the first interval ends before the second starts,
    1 when the second interval ends before the first starts, and 0
    otherwise, i.e. when they overlap (a shared endpoint counts as overlap).
    """
    # Unpack in the body: tuple parameters in the signature are Python 2-only
    # syntax and a SyntaxError on Python 3.
    astart, aend = a_interval
    bstart, bend = b_interval
    if aend < bstart:
        return -1
    if bend < astart:
        return 1
    return 0
# Script as posted in the question (reported as not working): it collects
# consecutive rows that share an identifier in current_TE and, when the
# identifier changes, tries to choose the indices of the rows to keep.
refine = open("tffm/tffm_te_hits95.txt", "r")
# NOTE(review): `refined` is opened for writing, but nothing in this snippet
# writes to it; the selected rows are printed to stdout instead.
refined = open("tffm/tffm_te_unique_hits95.txt", "w")
current_TE=[]
for hit in refine:
# Fields stay strings after split(), so the `<` comparisons on column 7 below
# are string comparisons, not numeric ones.
info=hit.rstrip().split('\t')
if len(current_TE)==0 or info[0]==current_TE[0][0]:
current_TE.append(info)
elif info[0]!=current_TE[0][0]:
# Identifier changed: decide which rows of the finished group to keep.
to_keep=[]
i=0
if len(current_TE)==1:
to_keep.append(0)
else:
# Only neighbouring rows (i, i+1) are compared.
for i in range(len(current_TE)-1):
# Identical coordinates: the later row is recorded only when its score
# compares higher; otherwise nothing is appended in this branch.
if [current_TE[i][1], current_TE[i][2]] == [current_TE[i+1][1], current_TE[i+1][2]]:
if current_TE[i][7]<current_TE[i+1][7]:
to_keep.append(i+1)
# NOTE(review): test() returns either the bool False or a tuple, never the
# string 'False', so this comparison is always true.
elif test([(current_TE[i][1], current_TE[i][2])], [(current_TE[i+1][1], current_TE[i+1][2])])!='False':
if current_TE[i][7]<current_TE[i+1][7]:
to_keep.append(i+1)
try:
to_keep.remove(i)
except:
pass
else:
to_keep.append(i)
else:
to_keep.append(i)
# NOTE(review): if this check sits inside the for loop above, i stops at
# len(current_TE)-2 and the condition can never hold -- confirm the intended
# nesting (the original indentation is not visible here).
if i==len(current_TE)-1:
to_keep.append(i+1)
# set() removes repeated indices before printing.
for item in set(to_keep):
print current_TE[item]
# NOTE(review): the group is emptied here, but the row held in `info` (the
# first row of the next identifier) is not appended to it in this snippet.
current_TE=[]
这种情况下的预期结果是(只去掉一行FBti0018875):
FBti0018875 2031 2045 - TTCCAGAAACTGTTG hsf 62 0.9763443383736672
FBti0018925 2628 2642 - GACTGGAACTTTTCA hsf 60 0.9532992561656318
FBti0018925 2828 2842 + AGCTGGAAGCTTCTT hsf 63 0.9657036377575696
FBti0018949 184 198 - TTCGAGAAACTTAAC hsf 61 0.965931072979605
FBti0018986 2036 2050 + TTCTAGCATATTCTT hsf 63 0.9943559469645551
FBti0018993 1207 1221 - TTCTAGCATATTCTT hsf 63 0.9943559469645575
FBti0018996 2039 2053 + TTCTAGCATATTCTT hsf 63 0.9943559469645575
FBti0019092 2985 2999 - TTCTAGCATATTCTT hsf 63 0.9943559469645409
FBti0019118 1257 1271 + TTCCAGAATCTTGGA hsf 60 0.9523907773798134
我已经尝试(使用上面的代码)先生成一个包含同一标识符所有行的列表,然后在其中查找坐标重叠的行;如果存在重叠,就根据最后一列选出其中一行。重叠检查本身是成功的,但在不同版本的代码中,我要么只得到部分行,要么得到以下错误:
Traceback (most recent call last):
File "<stdin>", line 29, in <module>
IndexError: list index out of range
答案 0 :(得分:0)
最后我解决了。有一个愚蠢的错误:我写的是字符串 'False',而不是布尔值 False。
以下是解决方案:
def test(lista, listb):
    """Report the first overlap between two lists of ``(start, end)`` pairs.

    The lists are scanned in step: whichever current interval ``check``
    reports as lying entirely before the other is skipped. On the first
    overlapping pair the function returns
    ``(True, index_a, interval_a, index_b, interval_b)``; if either list
    runs out first it returns ``False``.
    """
    idx_a = idx_b = 0
    while idx_a < len(lista) and idx_b < len(listb):
        relation = check(lista[idx_a], listb[idx_b])
        if relation < 0:
            idx_a += 1
        elif relation > 0:
            idx_b += 1
        else:
            # we found overlapping intervals
            return (True, idx_a, lista[idx_a], idx_b, listb[idx_b])
    return False
def check(first, second):
    """Order two closed ``(start, end)`` intervals relative to each other.

    Returns:
        -1 if ``first`` ends before ``second`` starts,
        1 if ``second`` ends before ``first`` starts,
        0 if the intervals overlap (touching endpoints count as overlap).
    """
    # The pairs are unpacked here rather than in the signature because tuple
    # parameters only exist in Python 2; this form runs on Python 2 and 3.
    astart, aend = first
    bstart, bend = second
    if aend < bstart:
        return -1
    if bend < astart:
        return 1
    return 0
def get_unique_sre(current_TE):
    """Keep only the best-scoring hit of every set of overlapping hits.

    Args:
        current_TE: rows belonging to one identifier. Each row is a sequence
            whose items 1 and 2 are the start and end coordinates and whose
            item 7 is the score (higher is better). Values may be strings,
            as produced by ``str.split``; they are converted to numbers
            before being compared.

    Returns:
        A new list with the surviving rows, in their original order. A row
        survives when it overlaps no better-scoring surviving row. Intervals
        are closed, so a shared endpoint counts as overlap. On equal scores
        the row that comes first in the input wins.

    Raises:
        ValueError: if a coordinate or score cannot be parsed as a number.
    """
    # Compare numbers, not text: as strings "200" sorts after "1000".
    spans = [(int(row[1]), int(row[2])) for row in current_TE]
    scores = [float(row[7]) for row in current_TE]

    # Visit hits from best to worst. sorted() is stable even with
    # reverse=True, so ties keep their input order.
    best_first = sorted(
        range(len(current_TE)), key=lambda idx: scores[idx], reverse=True
    )

    kept = []
    for idx in best_first:
        start, end = spans[idx]
        # Only hits that are actually kept can exclude another hit; a hit
        # that was itself discarded must not knock out its neighbours.
        if all(end < spans[k][0] or spans[k][1] < start for k in kept):
            kept.append(idx)

    return [current_TE[idx] for idx in sorted(kept)]
from itertools import groupby

# Read the hits, group consecutive rows that share an identifier (column 0),
# reduce each group with get_unique_sre() and write the surviving rows back
# out as tab-separated lines.
#
# Grouping with groupby() means the row that starts a new identifier belongs
# to the new group instead of being dropped, and the last group in the file
# is written as well. The `with` statement closes both files even on error.
with open("tffm/tffm_te_hits95.txt", "r") as refine, \
        open("tffm/tffm_te_unique_hits95.txt", "w") as refined:
    # Skip blank lines so they do not form a bogus group of their own.
    rows = (hit.rstrip().split('\t') for hit in refine if hit.strip())
    for _identifier, group in groupby(rows, key=lambda row: row[0]):
        for item in get_unique_sre(list(group)):
            # Write the row in the input format, not the repr of a list.
            refined.write('\t'.join(item) + '\n')