我是新手,先为我糟糕的英语道歉。我有两个文件(文件1:主XML文件;文件2:描述文件),希望把描述文件中的每一行依次插入到XML文件中的指定位置(即替换各个 Hit_def 中的 XX)。
文件1: 这是xml-tree:
<BlastOutput>
<BlastOutput_iterations>
<Iteration> (gene 1)
<Iteration_hits>
<Hit> (1-10)
<Hit_def>
<Iteration> (gene 2)
<Iteration_hits>
<Hit> (1-10)
<Hit_def>
由于文件有 5 GB 大,这里只给出开头和结尾的若干行:
<?xml version="1.0"?>
<BlastOutput>
<BlastOutput_program>RAPSearch</BlastOutput_program>
<BlastOutput_version>RAPSearch2</BlastOutput_version>
<BlastOutput_reference>YonganZhao,HaixuTangandYuzhenYe.RAPSearch2:afastandmemory-efficientproteinsimilaritysearchtoolfornextgenerationsequencingdata.Bioinformatics2012,28(1):125-126</BlastOutput_reference>
<BlastOutput_db>/mreferate/dwolff/RAPSearch2.23/db/NCBI_nr_dec15</BlastOutput_db>
<BlastOutput_param>
<Parameters>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-def>gene_id_1</Iteration_query-def>
<Iteration_query-len>37</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|939543432|gb|KPV42113.1|</Hit_id>
<Hit_def>XX</Hit_def>
<Hit_accession>KPV42113.1</Hit_accession>
<Hit_len>162</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>58.151</Hsp_bit-score>
<Hsp_score>139</Hsp_score>
<Hsp_evalue>-5.6061</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>54</Hsp_hit-from>
<Hsp_hit-to>90</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>28</Hsp_identity>
<Hsp_positive>33</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>LVLCDEPVSALDVSVQAAVLNLLLEIQREHGTTMIFI</Hsp_hseq>
<Hsp_midline>+V+CDEPVSALDVSVQAAVLLL+EIQ++HTMII</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gi|385280362|gb|EIF44286.1|</Hit_id>
<Hit_def>XX</Hit_def>
<Hit_accession>EIF44286.1</Hit_accession>
<Hit_len>327</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>54.6842</Hsp_bit-score>
<Hsp_score>130</Hsp_score>
<Hsp_evalue>-4.56249</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>169</Hsp_hit-from>
<Hsp_hit-to>205</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>24</Hsp_identity>
<Hsp_positive>31</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>LVICDEPVSALDVSVQAQIINLLQELQTEHNTAMLFI</Hsp_hseq>
<Hsp_midline>+V+CDEPVSALDVSVQA++LLE+Q+HTAM+I</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>3</Hit_num>
<Hit_id>gi|550913550|ref|WP_022666548.1|</Hit_id>
<Hit_def>XX</Hit_def>
<Hit_accession>WP_022666548.1</Hit_accession>
<Hit_len>721</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>53.5286</Hsp_bit-score>
<Hsp_score>127</Hsp_score>
<Hsp_evalue>-4.21462</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>549</Hsp_hit-from>
<Hsp_hit-to>585</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>27</Hsp_identity>
<Hsp_positive>31</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>MVICDEPVSALDVSVQAAVLNLLNEIKEEMGTTMIFI</Hsp_hseq>
<Hsp_midline>MV+CDEPVSALDVSVQAAVLLLEI+++TMII</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
...
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>77704984</Statistics_db-num>
<Statistics_db-len>28292933896</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>
文件2:
peptide ABC transporter ATPase, partial [Kouleothrix aurantiaca]
oligopeptide ABC transporter [gamma proteobacterium BDW918]
ABC transporter ATP-binding protein [Desulfospira joergensenii]
输出应该是:
<?xml version="1.0"?>
<BlastOutput>
<BlastOutput_program>RAPSearch</BlastOutput_program>
<BlastOutput_version>RAPSearch2</BlastOutput_version>
<BlastOutput_reference>YonganZhao,HaixuTangandYuzhenYe.RAPSearch2:afastandmemory-efficientproteinsimilaritysearchtoolfornextgenerationsequencingdata.Bioinformatics2012,28(1):125-126</BlastOutput_reference>
<BlastOutput_db>/mreferate/dwolff/RAPSearch2.23/db/NCBI_nr_dec15</BlastOutput_db>
<BlastOutput_param>
<Parameters>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-def>gene_id_1</Iteration_query-def>
<Iteration_query-len>37</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|939543432|gb|KPV42113.1|</Hit_id>
<Hit_def>peptide ABC transporter ATPase, partial [Kouleothrix aurantiaca]</Hit_def>
<Hit_accession>KPV42113.1</Hit_accession>
<Hit_len>162</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>58.151</Hsp_bit-score>
<Hsp_score>139</Hsp_score>
<Hsp_evalue>-5.6061</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>54</Hsp_hit-from>
<Hsp_hit-to>90</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>28</Hsp_identity>
<Hsp_positive>33</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>LVLCDEPVSALDVSVQAAVLNLLLEIQREHGTTMIFI</Hsp_hseq>
<Hsp_midline>+V+CDEPVSALDVSVQAAVLLL+EIQ++HTMII</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gi|385280362|gb|EIF44286.1|</Hit_id>
<Hit_def>oligopeptide ABC transporter [gamma proteobacterium BDW918]</Hit_def>
<Hit_accession>EIF44286.1</Hit_accession>
<Hit_len>327</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>54.6842</Hsp_bit-score>
<Hsp_score>130</Hsp_score>
<Hsp_evalue>-4.56249</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>169</Hsp_hit-from>
<Hsp_hit-to>205</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>24</Hsp_identity>
<Hsp_positive>31</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>LVICDEPVSALDVSVQAQIINLLQELQTEHNTAMLFI</Hsp_hseq>
<Hsp_midline>+V+CDEPVSALDVSVQA++LLE+Q+HTAM+I</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>3</Hit_num>
<Hit_id>gi|550913550|ref|WP_022666548.1|</Hit_id>
<Hit_def>ABC transporter ATP-binding protein [Desulfospira joergensenii]</Hit_def>
<Hit_accession>WP_022666548.1</Hit_accession>
<Hit_len>721</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>53.5286</Hsp_bit-score>
<Hsp_score>127</Hsp_score>
<Hsp_evalue>-4.21462</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>549</Hsp_hit-from>
<Hsp_hit-to>585</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>27</Hsp_identity>
<Hsp_positive>31</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>MVICDEPVSALDVSVQAAVLNLLNEIKEEMGTTMIFI</Hsp_hseq>
<Hsp_midline>MV+CDEPVSALDVSVQAAVLLLEI+++TMII</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
...
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>77704984</Statistics_db-num>
<Statistics_db-len>28292933896</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>
我第一次尝试自己编写脚本,但没有得到任何结果,简直是一场灾难。所以我希望有人可以帮助我。
答案 0 :(得分:0)
我刚刚更新了脚本,以匹配上面更新后的XML结构。
请查看下面代码中的注释:
use strict;
use warnings;
use XML::Simple;
# First, parse the XML into a nested Perl data structure.
# NOTE: XML::Simple builds the whole document in memory, so this approach is
# only practical for inputs that fit in RAM; a multi-gigabyte file needs a
# streaming parser instead.
open my $MF1, '<', 'my_xml.xml'
    or die "Cannot open 'my_xml.xml' for reading: $!";
my $xml = XMLin(
    $MF1,
    # Always represent <Iteration> and <Hit> as array refs, even when only one
    # of them is present; otherwise a lone element is returned as a hash ref
    # and dereferencing it with @{...} dies.
    ForceArray => [ 'Iteration', 'Hit' ],
    # Disable folding on 'name'/'key'/'id' so the structure is kept as parsed.
    KeyAttr    => [],
);
close $MF1;
=com This is a sample of the parsed $xml structure (truncated dump)
$VAR1 = {
'BlastOutput_db' => '/mreferate/dwolff/RAPSearch2.23/db/NCBI_nr_dec15',
'BlastOutput_program' => 'RAPSearch',
'BlastOutput_param' => {
'Parameters' => {}
},
'BlastOutput_reference' => 'YonganZhao,HaixuTangandYuzhenYe.RAPSearch2:afastandmemory-efficientproteinsimilaritysearchtoolfornextgenerationsequencingdata.Bioinformatics2012,28(1):125-126',
'BlastOutput_version' => 'RAPSearch2',
'BlastOutput_iterations' => {
'Iteration' => {
'Iteration_hits' => {
'Hit' => [
{
'Hit_accession' => 'KPV42113.1',
'Hit_id' => 'gi|939543432|gb|KPV42113.1|',
'Hit_hsps' => {
'Hsp' => {
'Hsp_hseq' => 'LVLCDEPVSALDVSVQAAVLNLLLEIQREHGTTMIFI',
'Hsp_bit-score' => '58.151',
'Hsp_identity' => '28',
'Hsp_align-len' => '37',
'Hsp_query-frame' => '0',
'Hsp_query-from' => '1',
'Hsp_qseq' => 'MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI',
'Hsp_evalue' => '-5.6061',
'Hsp_midline' => '+V+CDEPVSALDVSVQAAVLLL+EIQ++HTMII',
'Hsp_num' => '1',
'Hsp_positive' => '33',
'Hsp_hit-from' => '54',
'Hsp_score' => '139',
'Hsp_hit-to' => '90',
'Hsp_query-to' => '37'
}
},
'Hit_len' => '162',
'Hit_num' => '1',
'Hit_def' => 'XX'
},
{
'Hit_accession' => 'EIF44286.1',
'Hit_id' => 'gi|385280362|gb|EIF44286.1|',
'Hit_hsps' => {
'Hsp' => {
'Hsp_hit-from' => '169',
'Hsp_positive' => '31',
'Hsp_score' => '130',
'Hsp_query-to' => '37',
'Hsp_hit-to' => '205',
'Hsp_num' => '1',
'Hsp_midline' => '+V+CDEPVSALDVSVQA++LLE+Q+HTAM+I',
'Hsp_align-len' => '37',
'Hsp_query-frame' => '0',
'Hsp_qseq' => 'MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI',
'Hsp_evalue' => '-4.56249',
'Hsp_query-from' => '1',
'Hsp_bit-score' => '54.6842',
'Hsp_identity' => '24',
'Hsp_hseq' => 'LVICDEPVSALDVSVQAQIINLLQELQTEHNTAMLFI'
}
},
'Hit_def' => 'XX',
'Hit_len' => '327',
'Hit_num' => '2'
},
=cut
# Read the description file into an array, one description per line.
open my $MF2, '<', 'file2'
    or die "Cannot open 'file2' for reading: $!";
my @defs = <$MF2>;
close $MF2;
# Strip line endings, including the "\r" of Windows-style (CRLF) files, which
# chomp alone would leave behind and which would end up inside <Hit_def>.
s/\r?\n\z// for @defs;
# Update the parsed XML: replace each Hit_def with the next line of the
# description file.
# Descriptions are consumed sequentially across all iterations. Indexing by
# Hit_num alone would restart at the first description for every <Iteration>,
# because Hit_num starts again at 1 for each query.
my $def_index  = 0;
my $iterations = $xml->{'BlastOutput_iterations'}{'Iteration'};
# XML::Simple yields a hash ref for a single child and an array ref for
# repeated children (unless ForceArray is set), so accept both shapes.
foreach my $iteration ( grep { defined } ref $iterations eq 'ARRAY' ? @$iterations : ($iterations) ) {
    my $hits = $iteration->{'Iteration_hits'}{'Hit'};
    next unless defined $hits;    # iteration without any hits
    foreach my $hit ( ref $hits eq 'ARRAY' ? @$hits : ($hits) ) {
        # Use a scalar element ($defs[...]), not a one-element slice (@defs[...]).
        # Leave the placeholder untouched when the description file is too short.
        $hit->{'Hit_def'} = $defs[$def_index] if $def_index < @defs;
        $def_index++;
    }
}
warn sprintf( "Only %d descriptions available for %d hits; remaining Hit_def values were left unchanged\n",
    scalar @defs, $def_index )
    if $def_index > @defs;
# Write the updated structure back to file1 (this overwrites the input file).
# NOTE: XML::Simple does not preserve the original order of sibling elements;
# child elements are emitted sorted by name.
open my $MF1_new, '>', 'my_xml.xml'
    or die "Cannot open 'my_xml.xml' for writing: $!";
XMLout(
    $xml,
    OutputFile => $MF1_new,
    NoAttr     => 1,
    # Without this, the default KeyAttr makes XMLout treat nested hashes as
    # folded arrays and emit spurious <name>Iteration</name>, <name>Hsp</name>,
    # ... children instead of the real nested elements.
    KeyAttr    => [],
    RootName   => 'BlastOutput',
    # Re-emit the XML declaration, which XMLout omits by default.
    XMLDecl    => '<?xml version="1.0"?>',
);
# A failed close on a write handle means data may not have reached the disk.
close $MF1_new
    or die "Cannot close 'my_xml.xml' after writing: $!";
答案 1 :(得分:0)
最后我成功了。只是输出的结构与预期略有出入(元素顺序发生了变化,并且多出了 <name> 元素):
<BlastOutput>
<BlastOutput_db>/mreferate/dwolff/RAPSearch2.23/db/NCBI_nr_dec15</BlastOutput_db>
<BlastOutput_iterations>
<name>Iteration</name>
<Iteration_hits>
<Hit>
<Hit_accession>KPV42113.1</Hit_accession>
<Hit_def>peptide ABC transporter ATPase, partial [Kouleothrix aurantiaca]
</Hit_def>
<Hit_hsps>
<name>Hsp</name>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_bit-score>58.151</Hsp_bit-score>
<Hsp_evalue>-5.6061</Hsp_evalue>
<Hsp_hit-from>54</Hsp_hit-from>
<Hsp_hit-to>90</Hsp_hit-to>
<Hsp_hseq>LVLCDEPVSALDVSVQAAVLNLLLEIQREHGTTMIFI</Hsp_hseq>
<Hsp_identity>28</Hsp_identity>
<Hsp_midline>+V+CDEPVSALDVSVQAAVLLL+EIQ++HTMII</Hsp_midline>
<Hsp_num>1</Hsp_num>
<Hsp_positive>33</Hsp_positive>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_score>139</Hsp_score>
</Hit_hsps>
<Hit_id>gi|939543432|gb|KPV42113.1|</Hit_id>
<Hit_len>162</Hit_len>
<Hit_num>1</Hit_num>
</Hit>
<Hit>
<Hit_accession>EIF44286.1</Hit_accession>
<Hit_def>oligopeptide ABC transporter [gamma proteobacterium BDW918]
</Hit_def>
<Hit_hsps>
<name>Hsp</name>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_bit-score>54.6842</Hsp_bit-score>
<Hsp_evalue>-4.56249</Hsp_evalue>
<Hsp_hit-from>169</Hsp_hit-from>
<Hsp_hit-to>205</Hsp_hit-to>
<Hsp_hseq>LVICDEPVSALDVSVQAQIINLLQELQTEHNTAMLFI</Hsp_hseq>
<Hsp_identity>24</Hsp_identity>
<Hsp_midline>+V+CDEPVSALDVSVQA++LLE+Q+HTAM+I</Hsp_midline>
<Hsp_num>1</Hsp_num>
<Hsp_positive>31</Hsp_positive>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_score>130</Hsp_score>
</Hit_hsps>
<Hit_id>gi|385280362|gb|EIF44286.1|</Hit_id>
<Hit_len>327</Hit_len>
<Hit_num>2</Hit_num>
</Hit>
<Hit>
<Hit_accession>WP_022666548.1</Hit_accession>
<Hit_def>ABC transporter ATP-binding protein [Desulfospira joergensenii]
</Hit_def>
<Hit_hsps>
<name>Hsp</name>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_bit-score>53.5286</Hsp_bit-score>
<Hsp_evalue>-4.21462</Hsp_evalue>
<Hsp_hit-from>549</Hsp_hit-from>
<Hsp_hit-to>585</Hsp_hit-to>
<Hsp_hseq>MVICDEPVSALDVSVQAAVLNLLNEIKEEMGTTMIFI</Hsp_hseq>
<Hsp_identity>27</Hsp_identity>
<Hsp_midline>MV+CDEPVSALDVSVQAAVLLLEI+++TMII</Hsp_midline>
<Hsp_num>1</Hsp_num>
<Hsp_positive>31</Hsp_positive>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_score>127</Hsp_score>
</Hit_hsps>
<Hit_id>gi|550913550|ref|WP_022666548.1|</Hit_id>
<Hit_len>721</Hit_len>
<Hit_num>3</Hit_num>
</Hit>
</Iteration_hits>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-def>gene_id_1</Iteration_query-def>
<Iteration_query-len>37</Iteration_query-len>
<Iteration_stat>
<name>Statistics</name>
<Statistics_db-len>28292933896</Statistics_db-len>
<Statistics_db-num>77704984</Statistics_db-num>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_entropy>0.14</Statistics_entropy>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
</Iteration_stat>
</BlastOutput_iterations>
<BlastOutput_param>
<name>Parameters</name>
</BlastOutput_param>
<BlastOutput_program>RAPSearch</BlastOutput_program>
<BlastOutput_reference>YonganZhao,HaixuTangandYuzhenYe.RAPSearch2:afastandmemory-efficientproteinsimilaritysearchtoolfornextgenerationsequencingdata.Bioinformatics2012,28(1):125-126</BlastOutput_reference>
<BlastOutput_version>RAPSearch2</BlastOutput_version>
</BlastOutput>