下面的 Python 函数用于从字符串中删除破折号(即 “gap”),同时保持该字符串上各注释的位置正确。输入变量 instring 和 annotations 分别是一个字符串和一个字典。
def DegapButMaintainAnno(instring, annotations):
    """Remove gap characters ('-') from a string and re-index its annotations.

    Args:
        instring: The gapped string.
        annotations: Dict mapping a range name to a list of integer
            positions in ``instring``. Neither the dict nor its lists are
            modified by this function.

    Returns:
        A tuple ``(degapped_instring, degapped_annotations)``. The string is
        built from the non-gap characters found at annotated positions
        (characters not covered by any range are not emitted). The dict is
        new and maps every range name to its shifted position list.

    Note:
        Ranges are processed one after another in dict iteration order and
        each range appends its own non-gap characters. Ranges that share
        positions therefore emit the shared characters once per range
        (e.g. 'AATTT' rather than 'AATT'); only non-overlapping ranges
        yield the expected string.
    """
    collected_chars = []
    degapped_annotations = {}
    gaps_cumulative = 0
    for range_name, index_list in annotations.items():
        # Work on a private copy so the caller's list is left untouched.
        positions = list(index_list)
        gaps_within_range = 0
        for pos, char in enumerate(instring):
            if pos not in positions:
                continue
            if char == '-':
                # A gap inside this range: drop the position and remember
                # that everything after it moves one step to the left.
                positions.remove(pos)
                gaps_within_range += 1
            else:
                collected_chars.append(char)
                # Shift by the gaps seen so far within this range.
                positions[positions.index(pos)] = pos - gaps_within_range
        # Additionally shift by the gaps counted in previously processed ranges.
        degapped_annotations[range_name] = [
            i - gaps_cumulative for i in positions
        ]
        gaps_cumulative += gaps_within_range
    return (''.join(collected_chars), degapped_annotations)
如果输入字典中指定的各个范围互不重叠,该函数能按预期工作:
>>> instr = "A--AT--T"
>>> annot = {"range1":[0,1,2,3,4], "range2":[5,6,7]}
>>> DegapButMaintainAnno(instr, annot)
Out: ('AATT', {'range1': [0, 1, 2], 'range2': [3]})
但是,只要一个或多个范围重叠,代码就会失败:
>>> annot = {"range1":[0,1,2,3,4], "range2":[4,5,6,7]}
>>> DegapButMaintainAnno(instr, annot)
Out: ('AATTT', {'range1': [0, 1, 2], 'range2': [2, 3]}) # See additional 'T' in string
有人能建议如何修改这段代码,使其能正确处理重叠的范围吗?
答案 0 :(得分:0)
我认为你可能把问题想得太复杂了。这是我的尝试:
from copy import copy
def rewriteGene(instr, annos):
    """Strip gap characters ('-') from a string and re-index its annotations.

    Every annotated position that points at a gap is dropped; every other
    position is shifted left by the number of gaps that precede it. Ranges
    may overlap, since each position is mapped independently.

    Args:
        instr: The gapped string.
        annos: Dict mapping a range name to a list of integer positions in
            ``instr``. Neither the dict nor its lists are modified.

    Returns:
        A tuple ``(degapped_string, annotations)`` where ``annotations`` is a
        new dict with the same keys and freshly built position lists.
    """
    from bisect import bisect_left

    # Gap positions in ascending order; the set gives O(1) membership tests.
    gap_positions = [pos for pos, char in enumerate(instr) if char == '-']
    gap_set = set(gap_positions)

    annotations = {
        # bisect_left counts the gaps strictly before ``pos``.
        key: [
            pos - bisect_left(gap_positions, pos)
            for pos in positions
            if pos not in gap_set
        ]
        for key, positions in annos.items()
    }
    return instr.replace('-', ''), annotations
# Example with overlapping ranges: position 4 belongs to both of them.
instr = "A--AT--T"
annos = {"range1":[0,1,2,3,4], "range2":[4,5,6,7]}
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(rewriteGene(instr, annos))
# ('AATT', {'range1': [0, 1, 2], 'range2': [2, 3]})
# (dict key order may differ on Python versions without ordered dicts)
这段代码应该很容易读懂;如果有任何地方需要我进一步说明,请告诉我。