我有两个包含序列的fasta文件。我想把第二个文件中的序列与第一个文件中的序列进行比对，并报告一致性（identity）。
例如:
文件1:
>s1
aaccggactggacatccg
>s2
gtcgactctcggaattg
....
文件2:
>a1
actg
>a2
tccg
.....
我想取出file2中的每条序列，在file1的每条序列中查找它的匹配位置，然后以csv格式输出比对结果（不匹配的碱基用大写字母表示）以及一致性百分比
输出
name,a1_alignment,a1_identity,a2_alignment,a2_identity
s1,actg,100,tccg,100
s2,aCtg,95,tcCg,95
这是我到目前为止所做的:
import sys
import os,csv
from Bio import SeqIO
from itertools import *
from optparse import OptionParser
# Command-line handling: one optional mismatch threshold plus three
# positional paths (<file1> <file2> <fout>).
parser = OptionParser()
# type="int" makes optparse convert a user-supplied -m value to an integer;
# without it a command-line value arrives as a string and cannot be compared
# numerically with a mismatch count.
parser.add_option("-m", "--mismatch_threshold", dest="mismatch_threshold",
                  type="int", default=2,
                  help="This is the number of differences you'll allow between the actualread and your sequence of interest. Default is 2")
(options, args) = parser.parse_args()
# Validate the positional arguments left over after option parsing rather
# than the raw sys.argv length, so that passing -m does not trigger the
# usage message.
if len(args) != 3:
    print("Usage : python search.py <file1> <file2> <fout>")
    # Non-zero status so calling shells can detect the usage error.
    sys.exit(1)
f1 = open(args[0], 'r')    # FASTA file with the long (reference) sequences
f2 = open(args[1], 'r')    # FASTA file with the short (query) sequences
fout = open(args[2], 'w')  # CSV report destination
writer = csv.writer(fout)
def long(f1):
    """Yield a ``[name, sequence]`` pair for each FASTA record read from *f1*.

    *f1* is handed directly to ``SeqIO.parse(..., 'fasta')``; the pair holds
    the record's ``name`` and ``seq`` attributes, in that order.

    NOTE(review): under Python 2 this definition shadows the builtin
    ``long`` type for the rest of the module.
    """
    for entry in SeqIO.parse(f1, 'fasta'):
        yield [entry.name, entry.seq]
def short(f2):
for record in SeqIO.parse(f2,'fasta'):
head = record.name
seq = record.seq
return seq
def alignment(sequence,seq,mismatch_threshold):
l1 = len(sequence)
l2 = len(seq)
alignment = []
for i in range(0,min(l1,l2)):
if sequence[i] == seq[i]:
alignment.append(i)
else:
mismatch = sum( c1 != c2 for c1,c2 in zip(sequence,seq))
if mismatch <= mismatch_threshold:
alignment.append(i)
k = 0
l = 0
for read in alignment:
for letter in read:
if letter == isupper():
pass
else:
if letter == alignment[0].seq[j]:
l +=1
k += 1
k = 0
length = seq
percent = 100*l/len(seq)
#print percent
yield percent
longsequences = long(open(sys.argv[1],'r'))
shortsequences = short(open(sys.argv[2],'r'))
align = alignment(longsequences,shortsequences,options.mismatch_threshold)
for name in head:
writer.writerow(( name +'_alignment' , name + '_identity'))
for s in align:
# print to csv file
我需要帮助：如何在file1的序列中查找file2的序列（允许存在不匹配），输出比对结果，并计算一致性百分比
错误:
File "s.py", line 34, in alignment
l1 = len(sequence)
TypeError: object of type 'generator' has no len()