# Input data: the first line is a nucleotide string, the remaining lines are a
# protein written as concatenated three-letter residue codes.
raw_seq = '''GGACUAGCGGAGGCUAGUCC
METGLNLYSGLYASNPHEARGASNGLNARGLYSTHRVAL
LYSCYSPHEASNCYSGLYLYSGLUGLYHISILEALALYS
ASNCYSARGALAPROARGLYSLYSGLYCYSTRPLYSCYS
GLYLYSGLUGLYHISGLNMETLYSASPCYSTHRGLUARG
GLNALAASN'''
# Three-letter residue codes to look for, listed alphabetically.
ascodes = ['ALA','ARG','ASN','ASP','ASX','CYS','GLU','GLN','GLX','GLY','HIS','ILE','LEU','LYS','MET','PHE','PRO','SER','THR','TRP','TYR','VAL']
# The loop walks ascodes, not raw_seq, so each code is printed at most once and
# in the (alphabetical) order of ascodes -- not in the order the residues occur
# in the sequence.  `in` is a plain substring test on the whole string.
for amino in ascodes:
    if amino in raw_seq:
        print(amino)
我的代码按 ascodes 的字母顺序输出氨基酸,而不是按它们在序列中出现的顺序,这破坏了序列的全部生物学意义。我也尝试过正则表达式(regex),但没能写出合适的模式。
答案 0 :(得分:1)
有点取巧:用 str.join 把 ascodes 拼成一个以 | 分隔的正则模式,再交给 re.findall,这样输出就按各代码在 raw_seq 中出现的顺序排列:
import re

# Join all residue codes into one alternation pattern ("ALA|ARG|...") and scan
# raw_seq with it; findall returns the non-overlapping matches in the order
# they occur in the string.
re.compile('|'.join(ascodes)).findall(raw_seq)
输出:
['MET',
'GLN',
'LYS',
...
'ARG',
'GLN',
'ALA',
'ASN']
答案 1 :(得分:0)
您可以遍历每一个字符位置,检查当前字符与其后 2 个字符组成的三字母片段是否在氨基酸列表中。
# Slide a 3-character window over raw_seq and print every window that is a
# known residue code, in order of appearance.
# NOTE: the window advances one character at a time, so it also tests triplets
# that straddle two neighbouring codes (e.g. "GLY" inside "ARGLYS") and prints
# them whenever they happen to be valid codes.
for i in range(len(raw_seq)):
    amino = raw_seq[i:i+3]
    if amino in ascodes:
        print(amino)
结果如下(注意:窗口每次只移动一个字符,因此 ARGLYS 中跨越两个代码的 GLY 也会被匹配,结果里多出了 2 个 'GLY'):
['MET','GLN','LYS','GLY','ASN','PHE','ARG','ASN','GLN','ARG','GLY','LYS','THR','VAL','LYS','CYS','PHE','ASN','CYS','GLY','LYS','GLU','GLY','HIS','ILE','ALA','LYS','ASN','CYS','ARG','ALA','PRO','ARG','GLY','LYS','LYS','GLY','CYS','TRP','LYS','CYS','GLY','LYS','GLU','GLY','HIS','GLN','MET','LYS','ASP','CYS','THR','GLU','ARG','GLN','ALA','ASN']
答案 2 :(得分:0)
# Input data: the first line is a nucleotide string, the remaining lines are a
# protein written as concatenated three-letter residue codes.
raw_seq = '''GGACUAGCGGAGGCUAGUCC
METGLNLYSGLYASNPHEARGASNGLNARGLYSTHRVAL
LYSCYSPHEASNCYSGLYLYSGLUGLYHISILEALALYS
ASNCYSARGALAPROARGLYSLYSGLYCYSTRPLYSCYS
GLYLYSGLUGLYHISGLNMETLYSASPCYSTHRGLUARG
GLNALAASN'''
ascodes = ['ALA','ARG','ASN','ASP','ASX','CYS','GLU','GLN','GLX','GLY','HIS','ILE','LEU','LYS','MET','PHE','PRO','SER','THR','TRP','TYR','VAL']

# Cut every line into consecutive, NON-overlapping 3-letter chunks.  A window
# that slides one character at a time also sees frame-shifted triplets -- e.g.
# "ARGLYS" contains "GLY" across the ARG/LYS boundary -- and would report
# residues that are not part of the sequence.  Working line by line relies on
# each line starting on a code boundary, as it does in the data above; an
# incomplete trailing chunk (fewer than 3 characters) is dropped.
sep_set = [
    line[start:start + 3]
    for line in raw_seq.splitlines()
    for start in range(0, len(line) - 2, 3)
]
# Kept from the original script: raw_seq ends up without line breaks.
raw_seq = raw_seq.replace('\n', '')
# Keep only the chunks that are known residue codes, in order of appearance.
known_codes = frozenset(ascodes)
result = [chunk for chunk in sep_set if chunk in known_codes]
"""
output
['MET', 'GLN', 'LYS', 'GLY', 'ASN', 'PHE', 'ARG', 'ASN', 'GLN', 'ARG', 'LYS', 'THR', 'VAL', 'LYS', 'CYS', 'PHE', 'ASN', 'CYS', 'GLY', 'LYS', 'GLU', 'GLY', 'HIS', 'ILE', 'ALA', 'LYS', 'ASN', 'CYS', 'ARG', 'ALA', 'PRO', 'ARG', 'LYS', 'LYS', 'GLY', 'CYS', 'TRP', 'LYS', 'CYS', 'GLY', 'LYS', 'GLU', 'GLY', 'HIS', 'GLN', 'MET', 'LYS', 'ASP', 'CYS', 'THR', 'GLU', 'ARG', 'GLN', 'ALA', 'ASN']
"""