我在尝试修改一个可以运行的 Python 脚本时遇到了困难。
我有2个文件:
此脚本的目的是比较这两个文件:当第一个文件中的ID与第二个文件中某条序列的ID匹配时,输出该ID、完整的序列及其序列ID。
我在这里获得的脚本将获取第一个文件的ID和仅包含序列ID的序列文本作为输出。
这是脚本:
# Script from the question: read the FASTA file line by line and compare the
# accession taken from each header against the rows of a second text file.
# NOTE(review): the indentation of this snippet was lost in the paste, so the
# intended nesting of the second `with` block cannot be confirmed from here.
with open('uniprot_reviewed_taxonomy_9606.fasta', 'r') as f:
for line in f.readlines():
line = line.replace("\n", "")
# Header lines start with '>'; the accession is the second '|'-separated field.
if line.startswith('>'):
full_name = line.split('|')
accession_x = full_name[1]
print(accession_x)
else:
# Every non-header line is printed as it is.
print (line)
with open('homosapiens_output1.txt', 'r') as f1:
for line1 in f1.readlines()[1:]: # ignores the first line
line1 = line1.replace("\n", "")
# The row is split on single spaces; only the first two fields are used.
full_name1 = line1.split(' ')
# Fields produced by split(' ') contain no spaces, so replace(" ", "") is a no-op here.
accession_y = full_name1[0].replace(" ", "")
accession_z = full_name1[1].replace(" ", "")
# `line` is a single line of the FASTA file (a header or one sequence row),
# not the complete multi-line sequence of the record.
main_accession = accession_x + " " + accession_z + " " + accession_y + " " + line
if accession_x == accession_z:
print(main_accession)
您能帮我修改这个脚本,使输出内容包含ID、FASTA序列及其序列ID吗?
答案 0 :(得分:0)
下面是一个小的测试示例,演示具体的做法。
代码:
# Fixture data for the example: a list of accessions to look up and a small
# three-record FASTA file in UniProt header format.
test_ids = 'A8K660\nQ53XC5\n'
test_content = """>tr|Q53XC5|Q53XC5_HUMAN Bone morphogenetic protein 4 OS=Homo sapiens OX=9606 GN=BMP4 PE=2 SV=1
MIPGNRMLMVVLLCQVLLGGASHASLIPETGKKKVAEIQGHAGGRRSGQSHELLRDFEAT
LLQMFGLRRRPQPSKSAVIPDYMRDLYRLQSGEEEEEQIHSTGLEYPERPASRANTVRSF
HHEEHLENIPGTSENSAFRFLFNLSSIPENEVISSAELRLFREQVDQGPDWERGFHRINI
YEVMKPPAEVVPGHLITRLLDTRLVHHNVTRWETFDVSPAVLRWTREKQPNYGLAIEVTH
LHQTRTHQGQHVRISRSLPQGSGNWAQLRPLLVTFGHDGRGHALTRRRRAKRSPKHHSQR
ARKKNKNCRRHSLYVDFSDVGWNDWIVAPPGYQAFYCHGDCPFPLADHLNSTNHAIVQTL
VNSVNSSIPKACCVPTELSAISMLYLDEYDKVVLKNYQEMVVEGCGCR
>tr|A8K571|A8K571_HUMAN Bone morphogenetic protein 7 (Osteogenic protein 1), isoform CRA_b OS=Homo sapiens OX=9606 GN=BMP7 PE=2 SV=1
MHVRSLRAAAPHSFVALWAPLFLLRSALADFSLDNEVHSSFIHRRLRSQERREMQREILS
ILGLPHRPRPHLQGKHNSAPMFMLDLYNAMAVEEGGGPGGQGFSYPYKAVFSTQGPPLAS
LQDSHFLTDADMVMSFVNLVEHDKEFFHPRYHHREFRFDLSKIPEGEAVTAAEFRIYKDY
IRERFDNETFRISVYQVLQEHLGRESDLFLLDSRTLWASEEGWLVFDITATSNHWVVNPR
HNLGLQLSVETLDGQSINPKLAGLIGRHGPQNKQPFMVAFFKATEVHFRSIRSTGSKQRS
QNRSKTPKNQEALRMANVAENSSSDQRQACKKHELYVSFRDLGWQDWIIAPEGYAAYYCE
GECAFPLNSYMNATNHAIVQTLVHFINPETVPKPCCAPTQLNAISVLYFDDSSNVILKKY
RNMVVRACGCH
>tr|A8K660|A8K660_HUMAN Adiponectin OS=Homo sapiens OX=9606 GN=ADIPOQ PE=2 SV=1
MLLLGAVLLLLALPGHDQETTTQGPGVLLPLPKGACTGWMAGIPGHPGHNGAPGRDGRDG
TPGEKGEKGDPGLIGPKGDIGETGVPGAEGPRGFPGIQGRKGEPGEGAYVYRSAFSVGLE
TYVTIPNMPIRFTKIFYNQQNHYDGSTGKFHCNIPGLYYFAYHITVYMKDVKVSLFKKDK
AMLFTYDQYQENNVDQASGSVLLHLEVGDQVWLQVYGEGERNGLYADNDNDSTFTGFLLY
HDTN
"""

# Write both fixtures to disk: the FASTA file first, then the ID list.
for fixture_path, fixture_text in (
    ('test_sequences.fasta', test_content),
    ('test_ids.txt', test_ids),
):
    with open(fixture_path, 'w') as f:
        f.write(fixture_text)
# Load all sequences and store them in a dict keyed by accession.
# The file is read line by line so that only a '>' at the START of a line
# opens a new record. Splitting the whole text on '>' would also split on a
# '>' inside a description, and would crash on text before the first header.
records = {}
current_id = None
with open('test_sequences.fasta', 'r') as f:
    for line in f:
        if line.startswith('>'):
            header = line[1:].strip()
            fields = header.split('|')
            if len(fields) > 1:
                # UniProt style header: "db|ACCESSION|ENTRY_NAME description"
                current_id = fields[1]
            else:
                # Plain FASTA header: use the first word as the identifier;
                # an empty header yields no usable id and its record is skipped.
                words = header.split()
                current_id = words[0] if words else None
            if current_id is not None:
                # A repeated accession replaces the earlier record.
                records[current_id] = []
        elif current_id is not None:
            # Sequence rows are kept verbatim, including their line breaks.
            records[current_id].append(line)
sequences = {accession: ''.join(rows) for accession, rows in records.items()}

# import ids (surrounding whitespace and the trailing newline are removed):
with open('test_ids.txt', 'r') as f:
    ids = [line.strip() for line in f]

# checking ids
# and filter out those that are not in sequences dict
filtered_ids = [id_ for id_ in ids if id_ in sequences]

# writing new file with filtered sequences:
with open('filtered_ids.txt', 'w') as f:
    for id_ in filtered_ids:
        sequence = sequences[id_]
        f.write('>|' + id_ + '\n')
        f.write(sequence)
        # A record taken from the end of a file without a final newline would
        # otherwise run straight into the next header.
        if sequence and not sequence.endswith('\n'):
            f.write('\n')
# the final function:
def _read_fasta_records(seq_file):
    """Parse *seq_file* and return a dict mapping accession to sequence text.

    Only a '>' at the start of a line begins a new record. The accession is
    the second '|'-separated field of the header ("db|ACCESSION|NAME ...");
    for headers without '|' the first word is used. Sequence rows are stored
    verbatim with their line breaks. Lines before the first header and
    records with an empty header are ignored. If an accession occurs more
    than once, the last record wins.
    """
    records = {}
    current_id = None
    with open(seq_file, 'r') as f:
        for line in f:
            if line.startswith('>'):
                header = line[1:].strip()
                fields = header.split('|')
                if len(fields) > 1:
                    current_id = fields[1]
                else:
                    words = header.split()
                    current_id = words[0] if words else None
                if current_id is not None:
                    records[current_id] = []
            elif current_id is not None:
                records[current_id].append(line)
    return {accession: ''.join(rows) for accession, rows in records.items()}


def ids_filter(ids_file, seq_file, out_file):
    """Write the FASTA records whose accession is listed in *ids_file*.

    Args:
        ids_file: Path to a text file with one accession per line.
        seq_file: Path to the FASTA file to search.
        out_file: Path of the file to create. Each match is written as a
            ``>|<accession>`` header line followed by its sequence rows.

    Records are written in the order the accessions appear in *ids_file*;
    accessions that are not present in *seq_file* (and blank lines) are
    skipped.
    """
    sequences = _read_fasta_records(seq_file)

    with open(ids_file, 'r') as f:
        ids = [line.strip() for line in f]
    filtered_ids = [id_ for id_ in ids if id_ in sequences]

    with open(out_file, 'w') as f:
        for id_ in filtered_ids:
            sequence = sequences[id_]
            f.write('>|' + id_ + '\n')
            f.write(sequence)
            # Keep records separated even when the source file lacked a
            # final newline after its last sequence row.
            if sequence and not sequence.endswith('\n'):
                f.write('\n')