我有一个巨大的输入文件,看起来像这样,
contig protein start end
con1 P1 140 602
con1 P2 140 602
con1 P3 232 548
con2 P4 335 801
con2 P5 642 732
con2 P6 335 779
con2 P7 729 812
con3 P8 17 348
con3 P9 16 348
我想删除同源P或冗余P,我假设它们分别具有相同的起始和终点以及具有较小起点或终点的那些。所以我的输出文件将是这样的, file.txt的
con1 P1 140 602
con1 P3 232 548
con2 P4 335 801
con2 P7 729 812
尝试过的脚本,出于某种原因,它不符合这两个条件,
from itertools import groupby
def non_homolog(hits):
nonhomolog=[]
overst = False
for i in range(1,len(hits)):
(p, c) = hits[i-1], hits[i]
if p[2] <= c[2] and c[3] <= p[3]:
if not overst: nonhomolog.append(c)
nonhomolog.append(c)
overst = True
return nonhomolog
fh = open('example.txt')
oh = open('nonhomologs.txt', 'w')
for qid, grp in groupby(fh, lambda l: l.split()[0]):
hits = []
for line in grp:
hsp = line.split()
hsp[2], hsp[3] = int(hsp[2]), int(hsp[3])
hits.append(hsp)
hits.sort(key=lambda x: x[2])
if non_homolog(hits):
for hit in hits:
oh.write('\t'.join([str(f) for f in hit])+'\n')
答案 0 :(得分:1)
尝试使用此尺寸:
# this code assumes Python 2.7
from itertools import groupby, izip
from operator import attrgetter
INPUT = "file.txt"
HOMO_YES = "homologs.txt"
HOMO_NO = "nonhomologs.txt"
MAX_DIFF = 5
class Row:
__slots__ = ["line", "con", "protein", "start", "end"]
def __init__(self, s):
self.line = s.rstrip()
data = s.split()
self.con = data[0]
self.protein = data[1]
self.start = int(data[2])
self.end = int(data[3])
def __str__(self):
return self.line
def count_homologs(items, max_diff=MAX_DIFF):
num_items = len(items)
counts = [0] * num_items
# first item
for i, item_i in enumerate(items):
max_start = item_i.start + max_diff
max_end = item_i.end + max_diff
# second item
for j in xrange(i+1, num_items):
item_j = items[j]
if item_j.start > max_start:
break
elif item_j.end <= max_end:
counts[i] += 1
counts[j] += 1
return counts
def main():
with open(INPUT) as inf, open(HOMO_YES, "w") as outhomo, open(HOMO_NO, "w") as outnothomo:
# skip header
next(inf, '')
rows = (Row(line) for line in inf)
for con, item_iter in groupby(rows, key=attrgetter("con")):
# per-con list of Rows sorted by start,end
items = sorted(item_iter, key=attrgetter("start", "end"))
# get #homologs for each item
counts = count_homologs(items)
# do output
for c,item in izip(counts, items):
if c:
outhomo.write(str(item) + "\n")
else:
outnothomo.write(str(item) + "\n")
if __name__=="__main__":
main()
对您的给定数据产生:
=== homologs.txt ===
con1 P1 140 602
con1 P2 140 602
con3 P9 16 348
con3 P8 17 348
=== nonhomologs.txt ===
con1 P3 232 548
con2 P6 335 779
con2 P4 335 801
con2 P5 642 732
con2 P7 729 812