我的脚本可以正常运行,但我无法以表格格式打印输出。我有两个文件:我读取 file1 中的序列,并检查其中是否有序列能与 file2 中的任意一条序列绑定。
file1.fa
>seq1
ACCGG
>seq2
AATTTC
file2.fa
>seq3
CCGGT
>seq9
GGGGGGCCC
当前输出:
---
file1.seq
>seq1
ACCGG
file2.seq
>seq3
CCGGT
根据这两个文件,只有 seq1 与 seq3 绑定,所以我的代码的当前输出是正确的;但我需要的输出格式是列出每个序列 ID:如果两条序列发生相互作用则标记为 1,否则标记为 0。
我需要帮助修改 print 部分,以得到如下表格格式的输出:
file2 seq3 seq9
file1
seq1 1 0
seq2 0 0
我的代码:
from Bio import SeqIO
from Bio.Seq import Seq
# Compare every record of file1.fa with every record of file2.fa and print
# each pair for which a window of the file1 record's reverse complement
# occurs in the file2 sequence.
records = list(SeqIO.parse("file1.fa", "fasta"))
window_size = 5
step_size = 1
target_records = list(SeqIO.parse("file2.fa", "fasta"))

for record in records:
    # The reverse complement is the same for every window: build it once.
    rev_comp = record.seq.reverse_complement()
    for start in range(0, len(record.seq) - window_size + 1, step_size):
        # Slice by window_size instead of a hard-coded 5 so that changing
        # the setting above actually changes the window being searched.
        window = rev_comp[start: start + window_size]
        for target in target_records:
            if target.seq.find(window) != -1:
                # One report is printed per matching (window, target) pair.
                print('---\n{}\n>{}\n{}\n{}\n>{}\n{}'.format(
                    "file1.seq", record.id, record.seq,
                    "file2.seq", target.id, target.seq))
谢谢
答案 0 :(得分:1)
需要表格格式的输出时,我喜欢使用 pandas:
例如:
from Bio import SeqIO
import pandas as pd
def same_seq(a_record, brecord):
    """Return True if a window of a_record's reverse complement is in brecord.

    Every 5-base window of the reverse complement of ``a_record.seq`` is
    searched for in ``brecord.seq``.

    Args:
        a_record: Record whose ``seq`` provides ``reverse_complement()``.
        brecord: Record whose ``seq`` provides ``find()``.

    Returns:
        True as soon as one window is found in ``brecord.seq``; False when
        no window matches or the sequence is shorter than the window.
    """
    window_size = 5
    step_size = 1
    # The reverse complement has the same length as the sequence and does
    # not depend on the window position, so compute it once.
    rev_comp = a_record.seq.reverse_complement()
    for j in range(0, len(rev_comp) - window_size + 1, step_size):
        # Only stop early on a hit; returning the result of the first
        # comparison would leave every later window unchecked.
        if brecord.seq.find(rev_comp[j: j + window_size]) != -1:
            return True
    return False
if __name__ == '__main__':
    # Read both FASTA files into lists so they can be iterated repeatedly.
    records = list(SeqIO.parse("file1.fa", "fasta"))
    target_records = list(SeqIO.parse("file2.fa", "fasta"))

    # One row per file2 record; one 0/1 column per file1 record.
    rows_list = []
    for target_record in target_records:
        new_row = {'name': target_record.name}
        new_row.update(
            (record.name, 1 if same_seq(record, target_record) else 0)
            for record in records
        )
        rows_list.append(new_row)

    # Use the file2 record name as the row label of the printed table.
    df = pd.DataFrame(rows_list).set_index(["name"])
    print(df)
seq1 seq2
name
seq3 1 0
seq9 0 0
如果你想改变序列的比较方式:
应该使用 seq[j: j+5].reverse_complement()
而不是 seq.reverse_complement()[j: j+5]
不确定我的理解是否正确。
file1.fa
>seq1
ACCGG
>seq2
AATTTC
>seqtest1
NNNACCGTGCNN
file1.fa
>seq1
ACCGG
>seq2
AATTTC
>seqtest1
NNNACCGTGCNN
file2.fa
>seq3
CCGGT
>seq9
GGGGGGCCC
>seqtest2
NNCACGGTNN
from Bio import SeqIO
import pandas as pd
def same_seq_window(a_record, b_record, window_size):
    """Return True if a window of a_record reverse-complements into b_record.

    Each ``window_size``-long slice of ``a_record.seq`` is reverse
    complemented and searched for in ``b_record.seq``.

    Args:
        a_record: Record whose ``seq`` supports ``len()``, slicing and
            ``reverse_complement()`` on the slices.
        b_record: Record whose ``seq`` provides ``find()``.
        window_size: Length of the windows taken from ``a_record.seq``.

    Returns:
        True if at least one window matches; False otherwise, including
        when ``window_size`` is not positive or exceeds the sequence length.
    """
    if window_size <= 0:
        # An empty slice is "found" at index 0 of any sequence, which would
        # report a match although nothing was compared.
        return False
    a_seq = a_record.seq
    b_seq = b_record.seq
    return any(
        b_seq.find(a_seq[i: i + window_size].reverse_complement()) != -1
        for i in range(len(a_seq) - window_size + 1)
    )
def same_seq(a_record, b_record):
    """Return True if a_record and b_record share a reverse-complement match.

    A match is a window of at least 5 bases of ``a_record.seq`` whose
    reverse complement occurs in ``b_record.seq``.

    Args:
        a_record: Record whose ``seq`` is cut into windows.
        b_record: Record whose ``seq`` is searched.

    Returns:
        True if such a window exists, False otherwise (including when
        ``a_record.seq`` is shorter than 5 bases).
    """
    min_window_size = 5
    # Checking only the minimum size is sufficient: if a longer window
    # matches, its first 5 bases form a window whose reverse complement is a
    # substring of that longer match, so it is found in b_record.seq as well.
    # Looping over every larger size could never change the answer.
    return same_seq_window(a_record, b_record, min_window_size)
if __name__ == '__main__':
    # Read both FASTA files into lists so they can be iterated repeatedly.
    records = list(SeqIO.parse("file1.fa", "fasta"))
    target_records = list(SeqIO.parse("file2.fa", "fasta"))

    # Build one dict per file2 record: its name plus a 0/1 flag for every
    # file1 record, in file order.
    rows_list = [
        {
            'name': target_record.name,
            **{
                record.name: 1 if same_seq(record, target_record) else 0
                for record in records
            },
        }
        for target_record in target_records
    ]

    # Use the file2 record name as the row label of the printed table.
    df = pd.DataFrame(rows_list)
    df = df.set_index(["name"])
    print(df)
seq1 seq2 seqtest1
name
seq3 1 0 0
seq9 0 0 0
seqtest2 0 0 1