逆转录

时间:2018-07-10 21:51:24

标签: python genetic

目标是在给定完整的mRNA序列和氨基酸序列的情况下,找出mRNA中的编码序列,并将结果以密码子的形式表示。我想我已经得到了每个氨基酸可能对应的密码子列表,只是不确定如何将它们与给定的mRNA序列进行系统的匹配。以下是我目前的进展。

xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'

xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'

d = {'mRNA': ['UUU','UUC','UUA','UUG','UCU','UCC','UCA','UCG','UAU','UAC','UAA','UAG','UGU','UGC','UGA','UGG','CUU','CUC','CUA','CUG','CCU','CCC','CCA','CCG','CAU','CAC','CAA','CAG','CGU','CGC','CGA','CGG','AUU','AUC','AUA','AUG','ACU','ACC','ACA','ACG','AAU','AAC','AAA','AAG','AGU','AGC','AGA','AGG','GUU','GUC','GUA','GUG','GCU','GCC','GCA','GCG','GAU','GAC','GAA','GAG','GGU','GGC','GGA','GGG'], 'AA': ['F','F','L','L','S','S','S','S','Y','Y','_','_','C','C','_','W','L','L','L','L','P','P','P','P','H','H','Q','Q','R','R','R','R','I','I','M','M','T','T','T','T','N','N','K','K','S','S','R','R','V','V','V','V','A','A','A','A','D','D','E','E','G','G','G','G']}

import pandas  # required for pandas.DataFrame below; the snippet never imported it

# Codon table as a two-column frame: 'mRNA' holds the codon, 'AA' the
# one-letter amino acid it is mapped to in d.
AA = pandas.DataFrame(data=d)

# For every residue of the protein, list all codons the table maps to it.
for i in xAA:
    codons = list(AA.mRNA.loc[AA['AA'] == i])
    # Function-call form runs on Python 3 and prints the same text on Python 2.
    print(codons)

这是输出:

['AUA', 'AUG']
['GAU', 'GAC']
['UUU', 'UUC']
['UUU', 'UUC']
['GCU', 'GCC', 'GCA', 'GCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GGU', 'GGC', 'GGA', 'GGG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CCU', 'CCC', 'CCA', 'CCG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['GUU', 'GUC', 'GUA', 'GUG']
['ACU', 'ACC', 'ACA', 'ACG']
['GAA', 'GAG']
['GAA', 'GAG']
['ACU', 'ACC', 'ACA', 'ACG']
['CCU', 'CCC', 'CCA', 'CCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GGU', 'GGC', 'GGA', 'GGG']
['GAA', 'GAG']
['GCU', 'GCC', 'GCA', 'GCG']
['GGU', 'GGC', 'GGA', 'GGG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GAA', 'GAG']
['GAA', 'GAG']
['GAU', 'GAC']
['GAU', 'GAC']
['GAA', 'GAG']
['GUU', 'GUC', 'GUA', 'GUG']
['GUU', 'GUC', 'GUA', 'GUG']
['GCU', 'GCC', 'GCA', 'GCG']
['AUA', 'AUG']
['AUU', 'AUC']
['AAA', 'AAG']
['GAA', 'GAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['GAU', 'GAC']
['ACU', 'ACC', 'ACA', 'ACG']
['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']
['AUU', 'AUC']
['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']
['CCU', 'CCC', 'CCA', 'CCG']
['ACU', 'ACC', 'ACA', 'ACG']
['GUU', 'GUC', 'GUA', 'GUG']
['CAA', 'CAG']
['GAA', 'GAG']
['GAU', 'GAC']
['GGU', 'GGC', 'GGA', 'GGG']
['GGU', 'GGC', 'GGA', 'GGG']
['GAU', 'GAC']
['GUU', 'GUC', 'GUA', 'GUG']
['AUU', 'AUC']
['UAU', 'UAC']
['AAA', 'AAG']
['GGU', 'GGC', 'GGA', 'GGG']
['UUU', 'UUC']
['GAA', 'GAG']
['GAU', 'GAC']
['GGU', 'GGC', 'GGA', 'GGG']
['AUU', 'AUC']
['GUU', 'GUC', 'GUA', 'GUG']
['CAA', 'CAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['AAA', 'AAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CAA', 'CAG']
['GGU', 'GGC', 'GGA', 'GGG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UGU', 'UGC']
['ACU', 'ACC', 'ACA', 'ACG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UGU', 'UGC']
['CCU', 'CCC', 'CCA', 'CCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['AUU', 'AUC']
['AUU', 'AUC']
['ACU', 'ACC', 'ACA', 'ACG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['AAA', 'AAG']
['AAU', 'AAC']
['GGU', 'GGC', 'GGA', 'GGG']
['AUU', 'AUC']
['CAA', 'CAG']
['AAU', 'AAC']
['AUA', 'AUG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CAA', 'CAG']
['UUU', 'UUC']
['UAU', 'UAC']
['AUU', 'AUC']
['CCU', 'CCC', 'CCA', 'CCG']
['GAA', 'GAG']
['GUU', 'GUC', 'GUA', 'GUG']
['GAA', 'GAG']
['GGU', 'GGC', 'GGA', 'GGG']
['GUU', 'GUC', 'GUA', 'GUG']
['GAA', 'GAG']
['CAA', 'CAG']
['GUU', 'GUC', 'GUA', 'GUG']
['AUA', 'AUG']
['GAU', 'GAC']
['GAU', 'GAC']
['GAA', 'GAG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GAU', 'GAC']
['GAA', 'GAG']
['AAA', 'AAG']
['GAA', 'GAG']
['GCU', 'GCC', 'GCA', 'GCG']
['AAU', 'AAC']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['CCU', 'CCC', 'CCA', 'CCG']

如果我添加此处显示的for c循环,我将得到

# Builds one long string containing every candidate codon for every residue
# of xAA, in protein order.
codingseq = ""
for i in xAA:
    # All codons whose 'AA' column equals the current residue.
    codons = list(AA.mRNA.loc[AA['AA'] == i])
    for c in codons:
        # NOTE(review): the position returned by find() is discarded, so this
        # lookup does not influence which codons end up in codingseq.
        xmRNA.find(c)
        # Every candidate is appended, whether or not it occurs in xmRNA.
        codingseq+= c

这样得到的是每一种组合。有没有一种比较分析的方法,可以找出其中哪一个与完整的mRNA序列最为相似?

AUAAUG
AUAAUGGAUGAC
AUAAUGGAUGACUUUUUC
AUAAUGGAUGACUUUUUCUUUUUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUCUAUUAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUCUAUUACAAAAAG

请注意,由于超出字符数限制,因此未显示所有结果。 任何帮助都会很棒!

2 个答案:

答案 0 :(得分:0)

这将在xmRNA中找到与xAA匹配的密码子序列。注意:d['AA']中索引34处的数据已被更正(将'M'替换为'I'),以便与http://web.expasy.org/网站上使用的翻译一致。我没有使用pandas,只用了普通的Python。我只是做了一个简单的测试,试图找出xAA在xmRNA中的位置(以及所使用的密码子)。即使对于很长的序列,它也应该足够快(即使RNA长达100000个碱基,也应该是瞬间完成的)。

xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'

d = {'mRNA': ['UUU','UUC','UUA','UUG','UCU','UCC','UCA','UCG','UAU','UAC','UAA','UAG','UGU','UGC','UGA','UGG','CUU','CUC','CUA','CUG','CCU','CCC','CCA','CCG','CAU','CAC','CAA','CAG','CGU','CGC','CGA','CGG','AUU','AUC','AUA','AUG','ACU','ACC','ACA','ACG','AAU','AAC','AAA','AAG','AGU','AGC','AGA','AGG','GUU','GUC','GUA','GUG','GCU','GCC','GCA','GCG','GAU','GAC','GAA','GAG','GGU','GGC','GGA','GGG'],
       'AA': ['F',  'F',  'L',  'L',  'S',  'S',  'S',  'S',  'Y',  'Y',  '_',  '_',  'C',  'C',  '_','W','L','L','L','L','P','P','P','P','H','H','Q','Q','R','R','R','R','I','I',   'I'   ,'M','T','T','T','T','N','N','K','K','S','S','R','R','V','V','V','V','A','A','A','A','D','D','E','E','G','G','G','G']}

# Codon -> amino-acid lookup, pairing the two parallel lists in d by position.
r2a = dict(zip(d['mRNA'], d['AA']))

found = False
for s in range(3):
    # Cut the transcript into whole codons, reading from offset s; the
    # len(frame) - 2 bound drops any incomplete trailing triplet.
    frame = xmRNA[s:]
    t3 = [frame[k:k + 3] for k in range(0, len(frame) - 2, 3)]
    # Translate this reading frame into a string of one-letter amino acids.
    aa = ''.join(r2a[codon] for codon in t3)
    print(aa)

    idx = aa.find(xAA)
    if idx != -1:
        # Convert the residue index back to a nucleotide index in xmRNA.
        pos = idx * 3 + s
        # Keep only the codons that encode xAA.
        t3 = t3[idx:idx + len(xAA)]
        found = True
        break

if not found:
    print("Not found")
else:
    # Frames are reported 1-based; idx counts residues, pos counts nucleotides.
    print("found in frame {} at {} (pos={})".format(s + 1, idx, pos))
    print("codons: " + repr(t3))

答案 1 :(得分:0)

这是一个非常紧凑的解决方案。创建反向字典时没有任何冲突,因此可以执行以下操作:

# One-letter amino acid -> tuple of the RNA codons listed for it.  The "_"
# entry groups UAA, UAG and UGA, i.e. the stop codons.
codon_dict = {
    "F": ("UUU", "UUC"),
    "L": ("UUA", "UUG", "CUU", "CUC", "CUA", "CUG"),
    "S": ("UCU", "UCC", "UCA", "UCG", "AGU", "AGC"),
    "Y": ("UAU", "UAC"),
    "_": ("UAA", "UAG", "UGA"),
    "C": ("UGU", "UGC"),
    "W": ("UGG",),
    "P": ("CCU", "CCC", "CCA", "CCG"),
    "H": ("CAU", "CAC"),
    "Q": ("CAA", "CAG"),
    "R": ("CGU", "CGC", "CGA", "CGG", "AGA", "AGG"),
    "I": ("AUU", "AUC", "AUA"),
    "M": ("AUG",),
    "T": ("ACU", "ACC", "ACA", "ACG"),
    "N": ("AAU", "AAC"),
    "K": ("AAA", "AAG"),
    "V": ("GUU", "GUC", "GUA", "GUG"),
    "A": ("GCU", "GCC", "GCA", "GCG"),
    "D": ("GAU", "GAC"),
    "E": ("GAA", "GAG"),
    "G": ("GGU", "GGC", "GGA", "GGG"),
}

# Invert the table so each codon maps back to its amino acid.  No codon is
# listed under two residues above, so no entry is overwritten.
rna_dict = {
    codon: residue
    for residue, codons in codon_dict.items()
    for codon in codons
}

xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'

# Locate the nucleotide offset in xmRNA at which the codons encoding xAA
# begin, trying each of the three reading frames in turn.  xmRNA_index stays
# -1 when xAA is not encoded in any frame (mirroring str.find).
xmRNA_index = -1
for frame in range(3):
    # Stop at len(xmRNA) - 2 so every slice is a complete triplet; a shorter
    # trailing slice would not be a key of rna_dict and would raise KeyError.
    mapped = [rna_dict[xmRNA[i:i + 3]] for i in range(frame, len(xmRNA) - 2, 3)]
    protein_index = "".join(mapped).find(xAA)
    if protein_index >= 0:
        # Residue index -> nucleotide index, shifted by the frame offset.
        xmRNA_index = frame + protein_index * 3
        break
print(xmRNA_index)

这将返回486,即xAA对应的密码子在xmRNA中的起始位置。可以通过对字符串做范围检查来进一步完善它;如果您只想要最接近的匹配,也可以对子字符串进行组合匹配(不过我仍然会先做反向映射,再在映射后的空间中搜索,这样更快)。另外,我不清楚是否存在没有映射的常见缺失字符/三联体,如果有,可能需要针对这种情况做相应修改。