#!/usr/bin/perl -w
use strict;
use Bio::AlignIO;
# Reader: parses alignments from the FASTA file "seq.fas".
my $reader = Bio::AlignIO->new(
    -file   => "seq.fas",
    -format => "fasta",
);

# Writer: the leading ">" in the file name opens "seq.nxs" for writing;
# alignments are emitted in NEXUS format.
my $writer = Bio::AlignIO->new(
    -file   => ">seq.nxs",
    -format => "nexus",
);

# Copy alignments one at a time until the reader returns a false value.
ALIGNMENT:
while (1) {
    my $alignment = $reader->next_aln();
    last ALIGNMENT unless $alignment;
    $writer->write_aln($alignment);
}
我想重写这个脚本，以便能够从命令行指定输入文件名和格式。
输入文件是 FASTA 格式，我想把它转换为 PHYLIP 格式。
到目前为止，我写出了下面的代码：
#!/usr/bin/perl -w
use strict;
use Bio::AlignIO; #this is how you include a perl module in your program
# Convert an alignment file to PHYLIP format.
#
# Usage: script INPUT_FILE [INPUT_FORMAT [OUTPUT_FILE]]
#   INPUT_FILE    alignment file to read (required)
#   INPUT_FORMAT  format name passed to Bio::AlignIO as -format, e.g. "fasta"
#   OUTPUT_FILE   file to write (optional; defaults to "output.phy")
#
# The values are taken from the command line (@ARGV), not from standard input.
my ( $str, $for, $out_file ) = @ARGV;

# Fail early with a usage message instead of handing an undefined file name
# to Bio::AlignIO and getting an error from deep inside the library.
die "Usage: $0 INPUT_FILE [INPUT_FORMAT [OUTPUT_FILE]]\n"
    unless defined $str;

# Keep the historical output name when no third argument is given.
$out_file = "output.phy" unless defined $out_file;

my $in = Bio::AlignIO->new( '-file' => $str, '-format' => $for );

# The leading ">" opens the file for writing.
# NOTE(review): the 10-character column groups and the repeated blocks in the
# result are the PHYLIP writer's layout, not corruption. Bio::AlignIO::phylip
# documents -interleaved, -line_length and -tag_length constructor options to
# change it -- confirm against the installed BioPerl version before relying
# on them.
my $out = Bio::AlignIO->new( '-file' => ">$out_file", '-format' => "phylip" );

# Write to $out while there are still alignments left in $in.
while ( my $aln = $in->next_aln() ) {
    $out->write_aln($aln);
}
这是输出的一部分:
16 1582
gi|4559793 GCAAATATTT AGCCCAACAT GAATCCAAAA AAAAATATTA AACAAATAAA ACCAAAACAT
gi|9448114 GCCAAATAAA GCCCTCTTAC CCCAAAAATC TTAACTATTC AAATTTTCCT ACAAAACATT
gi|8330908 GCTAAYCTAA GCCCCGCTAA TTATTTATAY AACTAATTAA CTAAATCAAA AAAAAACATT
gi|1087933 GCCAATATAA GCCCATCCAC TAACTAATAC AACTATATTA AATTTCTACC AAAACATTCA
gi|1155320 GCTAACCTAA GCCCGCACAA CTATTAAATT AACTATTTTC ATTACTAATC TAAACCATTT
gi|1087932 GCCAATCAAA AGCCTCCCCA CCTCAAGAAT CAATTATTTA AATTATTCAC CAAAACATTT
gi|1087935 GCCAATAATA GCCCCAACAC CCACCAAAAA CAATTATTAC AAATAACCAA ACAAAACATT
gi|1087933 GCCAATATAA GCCCATCCAC TAACTAATAC AACTATATTA AATTTCTACC AAAACATTCA
gi|8330902 GCGAGACTTA GTCCTAGCAC ACAACTATTT AACTATTTTA CCTAATCTTC AAAACATTAA
gi|1807789 GCCAATAAAA GCCCATCCAC CCACCAACAC AATTATATCA AATCTTAACG AAAACATTAA
gi|4559822 GCTAATATTA GCCCTACATG AATCCAAAAA AAAATATTTA ATAATCAATC TCAAAACATT
gi|1367680 GCCAATTTTA GCCCTACCAA TATCTCACAA AACTAACTTA CCTGTTTTTT AAAAAACATT
gi|9448117 GCCAATATAA AGCTCCATTC ATACAAAAAT AAAATATTAT TACTAATCAA TCAAATCATT
gi|5639854 GCTAATTTAA AGCCCCAACA ACTATTCATA AAATTAATTT ATCTACTAAA AAAACATTAA
gi|4559833 CAATCATAGC CCTAACTATC CCCAACTAAA CTAATCATAA CTCTAACTAA AACATTTTCT
gi|5835470 TCCAATAGAT CGTTTATTTA TTTACCCTCT TGTTATTTCG AAACATCCAC CACCCTTAAT
TTAATCATTT AAGTATAGGT GATAGAAAAT TTTATACGGG CGCTATAACG TAAGTACCGC
TCTCCTCCTA GTATAGGTGA TAGAACAGAC AAACAGGAGC AATAACGCTA GTACCGCAAG
CATTTATCAT AGTATAGGTG ATAGAAAAGA TTAATWGGGA GCCATAACRA AAGTACCGYA
ATTCATCCTA GTATAGGAGA TAGAACAGAT AAAAAGGCGC ATTAATGATA GTACCGCAAG
ACCCTATCAT AGTATAGGCG ATAGAACAGA TACCCAGGCG CAATAACGCA CAGTACCGTA
ACAAATCCAA GTATAGGTGA TAGAAAAGAT CACATGAGCG CAATAGCTAT AAGTACCGCA
TATTCATCCA AGTATAGGTG ATAGAACAGA CACACAGGAG CAATATATAC TAGTACCGTA
ATTCATCCTA GTATAGGAGA TAGAACAGAT AAAAAGGCGC ATTAATGATA GTACCGCAAG
CCTTTATCCT AGTATAGGTG ATAGAACAGA TACTTATAGG CACAATAACG AAAGTACCGT
TTTAATCACA GTATAGGTGA TAGAACAGAT AAATAGGCGC ATTAATGTAA GTACCGCAAG
TAATTAATCA AGTATAGGTG ATAGAAAGAT TTGTACAGGC GCTATAATGT AAGTACCGTA
TTACTTATCA TAGTATAGGT GATAGAAAAG ATACTTAAGG AGCCATAACG AAAGTACCGC
TACCCAAATC TTAGTATAGG TGATAGAAAA GATTACTAGG ACGCTATAAT GTAAGTACCG
CTAATCATAG TATAGGTGAT AGAAAAGATA CTAATGGAGC TATAACGAAG AGTACCGTAA
TGATCTTAGT ATAGGAGATA GAAAAGATAA CCTAGGCGCC ATAACATTAG TACCGCAAGG
CCCCTGAAAG TAAAGGCGAT TGAACCCAGT AAAACTGCAA GTACCGTAAG GGAACTATGA
为什么输出会被分成这样的列？看起来一部分序列被换到了后面的行，显得很混乱。输出文件应如下所示：
gi|4559793GCAAATATTTAGCCCAACATGAATCCAAAAAAAAATATTAAACAAATAAA
gi|9448114GCCAAATAAAGCCCTCTTACCCCAAAAATCTTAACTATTCAAATTTTCCT
gi|8330908GCTAAYCTAAGCCCCGCTAATTATTTATAYAACTAATTAACTAAATCAAA
gi|1087933GCCAATATTAGCCCATTCAACTATTAAACCAAATATCACAATCACTTAAT
gi|1155320GCTAACCTAAGCCCGCACAACTATTAAATTAACTATTTTCATTACTAATC
gi|1087932GCCAATCAAAAGCCTCCCCACCTCAAGAATCAATTATTTAAATTATTCAC
gi|1087935GCCAATAATAGCCCCAACACCCACCAAAAACAATTATTACAAATAACCAA
gi|1087933GCCAATATAAGCCCATCCACTAACTAATACAACTATATTAAATTTCTACC
gi|8330902GCGAGACTTAGTCCTAGCACACAACTATTTAACTATTTTACCTAATCTTC
gi|1807789GCCAATAAAAGCCCATCCACCCACCAACACAATTATATCAAATCTTAACG
gi|4559822GCTAATATTAGCCCTACATGAATCCAAAAAAAAATATTTAATAATCAATC
gi|1367680GCCAATTTTAGCCCTACCAATATCTCACAAAACTAACTTACCTGTTTTTT
gi|9448117GCCAATATAAAGCTCCATTCATACAAAAATAAAATATTATTACTAATCAA
gi|5639854GCTAATTTAAAGCCCCAACAACTATTCATAAAATTAATTTATCTACTAAA
gi|4559833CAATCATAGCCCTAACTATCCCCAACTAAACTAATCATAACTCTAACTAA
gi|5835470TCCAATAGATCGTTTATTTATTTACCCTCTTGTTATTTCGAAACATCCAC
ACCAAAACATTTAATCATTTAAGTATAGGTGATAGAAAATTTTATACGGG
ACAAAACATTTCTCCTCCTAGTATAGGTGATAGAACAGACAAACAGGAGC
AAAAAACATTCATTTATCATAGTATAGGTGATAGAAAAGATTAATWGGGA
CTAAACCATTTACTCTGTCCAAGTATAGGTGATAGAAAAGACTAATCCAG
TAAACCATTTACCCTATCATAGTATAGGCGATAGAACAGATACCCAGGCG
CAAAACATTTACAAATCCAAGTATAGGTGATAGAAAAGATCACATGAGCG
ACAAAACATTTATTCATCCAAGTATAGGTGATAGAACAGACACACAGGAG
AAAACATTCAATTCATCCTAGTATAGGAGATAGAACAGATAAAAAGGCGC
AAAACATTAACCTTTATCCTAGTATAGGTGATAGAACAGATACTTATAGG
AAAACATTAATTTAATCACAGTATAGGTGATAGAACAGATAAATAGGCGC
TCAAAACATTTAATTAATCAAGTATAGGTGATAGAAAGATTTGTACAGGC
AAAAAACATTTTACTTATCATAGTATAGGTGATAGAAAAGATACTTAAGG
TCAAATCATTTACCCAAATCTTAGTATAGGTGATAGAAAAGATTACTAGG
AAAACATTAACTAATCATAGTATAGGTGATAGAAAAGATACTAATGGAGC
AACATTTTCTTGATCTTAGTATAGGAGATAGAAAAGATAACCTAGGCGCC
CACCCTTAATCCCCTGAAAGTAAAGGCGATTGAACCCAGTAAAACTGCAA