当我尝试对齐多个文件时遇到了问题
这是我的剧本:
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import ClustalOmegaCommandline
def divergence(fic1dna,fic2dna,fic1prot,fic2prot):
from Bio import SeqIO
seq1dna = list(SeqIO.parse(fic1dna, "fasta",alphabet=IUPAC.IUPACUnambiguousDNA()))
seq2dna = list(SeqIO.parse(fic2dna, "fasta",alphabet=IUPAC.IUPACUnambiguousDNA()))
seq1prot = list(SeqIO.parse(fic1prot, "fasta",alphabet=IUPAC.protein))
seq2prot= list(SeqIO.parse(fic2prot, "fasta",alphabet=IUPAC.protein))
u=0
while u < len(seq1dna): # make an alignment betwen each element on 2 files for 2 paires files
nuc1=str(seq1dna[u].seq)
nuc2=str(seq2dna[u].seq)
prot1=str(seq1prot[u].seq)
prot2=str(seq2prot[u].seq)
prot1 = SeqRecord(Seq(prot1, alphabet=IUPAC.protein),id='pro1')
prot2 = SeqRecord(Seq(prot2, alphabet=IUPAC.protein),id='pro2')
aln = MultipleSeqAlignment([prot1, prot2])
print(aln)
u+=1
print(divergence("concatenate_0035_fna_renamed.fst","concatenate_0042_fna_renamed.fst","concatenate_0035_faa_renamed.fst","concatenate_0042_faa_renamed.fst"))
所以,你可以看到我有4个文件,对应于来自2个物种的244个序列,我需要为它们中的每一个计算dN dS,所以,我需要在密码子比对中对齐每个配对的seq。
但是,当我试图调整我的244个蛋白质序列时,错误“ValueError(”序列必须都是相同的长度“)引发”
我不知道为什么脚本不接受具有不同长度的序列,因为所有其他程序都这样做。
短输入将是:
来自sp 1
的AA seq的一个文件>EOG090X005Q
CEHNTAGRDCEKCLDFYNDAPWGRASPTNVHECKACNCNGFSNKCYFDKDLYERTGHGGHCIDCEENRDGANCERCKENFYQGMEDICLPCNCNPTGSRSLQCNAEGKCQCKPGVTGDKCDVCAPNYFEFTMHGCKPCDCNVSGSYGNTPQCDPQTGVCLCKQNVEGRRCRECKPGFFNLDVENEFGCTPCFCFGHSSQCSSAPKYQAHEISAHYIRDAEKWGAEDDQRKPVQLQFNANTQNIAVASKGSEILYFLASGQFLGDQRPSYNHDLKFTLRLGESGGYPSSQDIILEGARSSVSMNIYAQNNPEPSDVAQEYSFRLHEDPRYGWTPTLSNFEFMSILQNLTAIKIRGTYNKGGVGYLINFKLETAKIGREKGSAPANWVEKCSCPKAYVGDYCEECAPGYKHEPANGGPYSTCIPCDCNGHAHICDTATGFCICKHNTTGSNCELCAKGFYGNAIAGTADDCKPCPCPKDSGCIQLMDQSIVCTDCPVGYAGPRCEVCADAHFGDPTGQFGAPQECEECQCNGNVDPNAVGNCNRTTGECLKCIYNTAGEHCDKCLSGYFGDALDQKKKGDCKPCQCLEAGTVESPEGARKAPLCDGLTGFCSCRPHVIGRNCDKCEVDLNCIAVLKT
>EOG090X00BV
MNAHFPQNEIARSEAYNIMSVRKQYLVPKDGTPLSGLIQDHVISGVKMSIRGAFFTKADYQQLVFQALSNHKGEIKLLPPTILKPIMLWSGKQILSTIIINSIPKGKPYLSLTGKAKISSKAWQKEPARTWNAGGTPFTNPNSMSEAEVIIRKGELLCGVLDKTHYGATPYGLVHCMYELYGGDSSSALLSSFSKVFTFYLQWIGFTLGVKDILVVEEADKQRDNFINLVRKVGKVAAAKATELPVDVDELKLKETISEMLIKDPKFRANLDRQYKSLLDSYTNNINTVCLSEGLLEKFPYNNLQLMVQSGAKGSTVNTMQISCLLGQIELEGKRPPLMISGRSLPSFPPYDISPRAGGFIDGRFMTGIQPQEFFFHCMAGREGLIDTAVKTSRSGYLQRCLIKHLEGLSVAYDHTVRDSDSSVIQFAYGEDGLDVIKCQYFNKDQFEFLDVNSNAVISKSAIKKLKEDDKSKALAKSQKSLKKWKKKNGNPFEKVRYSPFTEFSAIAKNDIVLDDKPTDQTRDPNYWELEKMWRNLDADEKKQYARKRCPDPIPSKYSPEYKFGVINEQLNELTQNYLKNRKEHMYSDYTDKDKFTEIINAKYLASMAAPGEPVGLLAAQSIGEPSTQMTLNTFHFAGRGDMNVTLGIPRLREILMTASAKLKTPSMDIPFRSDLPDLNKKAERLRQKMNRVTVSDVLEKIDVHCEIVTNPNRQLKTVMRFSFLPHSQYKVQYTVKPAQIIKHMQNKFFSEMFSIIRKQAKTTCGVMWSTEKEKKRRAASDEDDEDGEGASPDVAEKAVNMDEDSSDEEGPNDDDDNTDVS
和另一个物种2:
>EOG090X005Q
MGGKIAAILLFAFFTSGSRSEPDFVDGQFNKINKNRVEVKCYDDFGAPQRCIPPFENAAFGVLMEATNTCGQDGRPTEFCRQTGVQRKPCEFCHPGDHPASFLTDRDNNDNATWWQSETMHEGIEYPNKVVLTLNLGKTYDITYVRVLFESPRPESWGIFRRRTEDSPWEPYQFYSATCRDTYGLPDRKDTVRGEDTRVLCTSEYSDISPLRRGTVAFSTLEGRPSAFQFDTNPALQSWVQATDLRLSLDRPNTFGDELFGDGQVLKSYYYAIADVAVGARCACNGHAGECINSPHTNGTTRRVCRCEHNTAGPDCNECLPFYNDAPWGRATTTDAHECKPCNCNGYSDRCYFDKDLYERSGHGGHCTDCRANRAGPNCERCRENFYQRLEDSYCVACNCNEIGSRSLQCNSEGKCQCKPGITGDKCDRCAANFFNFDSLGCTSCECSPKGSLDNEPNCDPVSGACVCKENVEGKRCRECRPGFFNLDLDNEFGCTPCFCYGHSSVCNLANGYSKLTIESMFGRGNEKWTASVAGNPIPLHYDAVTQTISVNAPDRDNVYFVAPERFLGDQRASYNQDLTFTLRIAENEPAPTARDVILEGGNGEQLTQPIFGQTNQLPNASPQVYKFRLNEHADYGWEPRVTSRAFMSVLSNLTAIKIRGTYTHQGRGFLDDVSLETAQRGAAGEPADWIEHCQCPHGYVGQFCESCAPGFHHDPPNGGPFSLCVPCNCNGHADICEAETGQCICHHNTAGSNCDLCSRGFYGYPLKGTPHDCKPCPCPDNGPCILLGNNPDPICSECPSGRTGARCETCSDGYFGNPDQGQACRLCDCNNNIDLNAVRNCNHETGECLKCVNNTAGFHCEDCLSGYWGDALSERKEDSCKLCQCYPPGTIELDDGSVAPCNQLTGHCACKPHVIGRNCDKCEDGYYQILSGDGCTACNCDPEGSYNRTCDATTGQCECRPGITGKRCDTCLPYQFGFGRDGCKHCDCDTIGSQELQCDASGQCPCLTNVEGRRCDRCKENKYNRQYGCIDCPPCYNLIQDSVNQHRRRLNELESTLRKINNSPTVMKDSDFEKELKNVENRVKSLLQVAKQGSGNENKTLVEQLDELRDQLNQIEKISQSVDATAEDARRTTNEGLTSIEEAERVLDQIYEQLTEAEDYLATDGARALAAAKKRADQVGQQNQQMTIIAQEARVLADLNTNEAKKIHVLAEQARNTSLEAYNLAKKAIAKYSNISDEIRGLENKLELLEDRFNEVKNLTAAAVAKSAAVDKEALQLLILDLRVPAVDTNELRILLETVSVDGSEIKEQAQLLLGQNEAWLNELANKARKSEELLERAQDQQAATADLLSEVDGANEKAKDALKRGNQTLVEAQETLKKLGEFDAEVQKERIKAQEALTVLEEIKDMVNEAIAKANETESVLKDAESNAIAAKDIAIQAQVSNNADEASANANLIRQEANKTKLDAVRLGNEADKLHLRVEITNSIAKKHEARVDKDVNATNEVNHQVGQARNSLNLAGQQVDKALAEVDEIIKELDVLPEIDDADLDRLEERLLAAEKEIEEANLEKRIRELTEAKNLQTQWVKNYEDEVSRLRLEVENIDDIRKALPSICYKRLRLEP
>EOG090X00BV
MFSIFTASDVRNLSVLKISTPLSFNILGHPLKGGLYDPALGPLNDRSDPCGTCGEGTIQCMGHFGHIELPVPVVNPLFHKVLTSLLKLSCLKCYTLQIPSYLKLLLNGKLRLMEEGFSNDIPGLEQEVGSAVAGMNRIAEGELEFISDIIEAYIEMTCNQRHHVQSGKSKESTSTRTLNMEWHHYIESVVKTCKASKLCINCRNPIPKMTILKNKILTNHVVNNEDTMMEDRVIHKLETSFMTPDQSKKHLRGLWQKEADILRIIIPCLGSVDLEFPTDVFFFEIIPVLPPITRPVNMLDNQLVEHPQSQVYKSIIQDCLVLRNIIQTIQDGDTTQLPEEGRAVFDEIRGDNAAEKLHHAWTTLQSNVDHLMDREMSKTTESANCHGLKQVIEKKEGIIRMHMMGKRVNYAARSVITPDPNLNIDEIGVPEAFALKLTYPVPVTPWNVTELRKLIINGPEIHPGAVMIEGEDGFVKLLRGDDKTQLEAIAKRLLTSSRKPFSGIKIVHRHLQNGDMLLLNRQPTLHKPSIMAHKARILKGEKTLRLHYANCKAYNADFDGDEMNAHFPQNELARSEGYFIANVSNQYLVPKDGTPLGGLIQDHVISGVRLTLRGNFFNRQDYMQLVYSAIADTTGDLILLPPTILKPVRLWSGKQIISTVIINLTPRGRAPINLKASAKISVKDWQVKKARKWKCGQEFTDQRTMSEAEVVIRGGELLSGVLDKTHYGATPYGLIHCLFELYGGTCSSKVLSAFGKLFQTYLQISGFTLGVEDILVVRKSDQKRREIIEACRQIGDQIQTATVELPPGTSEEQVKSKMEESYAKDPKFRAIVDRKYKSALDVFTNNINKTCLPAGLLKKFPHNNLQLMVQSGAKGSTVNTMQISCLLGQIELEGKRPPLMINGKSLPSFPAYDSSPRSGGFIDGRFMTGIQPQEFFFHCMAGREGLIDTAVKTSRSGYLQRCLIKHLEGLTVNYDSTVRDSDGSLIQMSYGEDGLDIPNSRFLRKEELDFLVENRKAIVDPALVEHLKDETTEKIRKINKKIRKWRTKHGNGSTKWRNSEFAKFSEINRNSGSSKNRQINSNCGRTKAALSLMKKWIRADEEVKKKLKDECVRCPDPVTSIFRQDLQFGVLTEKMEALMEEYLDEKSRRFTTSIGKEEVRDLLCTKIMKSLCPPGEPVGLLAAQSIGEPSTQMTLNTFHFAGRGEMNVTLGIPRLREILMMASKNIKTPSMEIPFRTDLPNVENQATKLQLKLTKCYLSNILKNIKLDRKLEENPNRQLTFTLTVNCLPHKFYKNEYCVKPHNVLNEIERNFFKLFFRAIKKIGKATGTLLHIEEEKSSSREDDAMLDTGEPDETEAKPNRSDLGELHESSDEDEAAEDADATASRSIARHRENQEYEDPEEEEIEDAAPREPEDEENPQNPTNLPPEDEDDLDQPMCVADELITEQRKKDVVNMHPYALDYDYDSEKFLWCKLTFWLPLRMCRLDLPTILRTVAEKVVLWETPAIKRAFTFQNSEGETILKTDGLNIVEMFKYAQILDLHKLYTNDIYGVSRTYGIEAANRVILKEVKDVFKMYGITVDSRHLSLIADYMTFDGTFQPLSRKGMEDSASPLQQMSFEASLNFLKNATLQGKHDDLMSPSSRLMVGQPCKTGTGAFNVLFKMNNTAVSM
有人可以帮助我吗? 谢谢你