在 Perl 中将一个文件中的描述整合到另一个 XML 文件中

时间:2016-01-27 08:38:37

标签: xml perl

我是新手,英语不好,请多包涵。我有 2 个文件(文件1:主 XML 文件;文件2:描述文件),我希望把描述文件中的每一行描述依次插入到 XML 文件中的特定位置(替换 Hit_def 中的 XX)。

文件1: 这是xml-tree:

 <BlastOutput>
    <BlastOutput_iterations>
        <Iteration> (gene 1)
            <Iteration_hits>
                <Hit> (1-10)
                    <Hit_def>
        <Iteration> (gene 2)
            <Iteration_hits>
                <Hit> (1-10)
                    <Hit_def>

由于文件大小为 5 GB,这里只给出文件开头和结尾的部分内容:

<?xml version="1.0"?>
<BlastOutput>
<BlastOutput_program>RAPSearch</BlastOutput_program>
<BlastOutput_version>RAPSearch2</BlastOutput_version>
<BlastOutput_reference>YonganZhao,HaixuTangandYuzhenYe.RAPSearch2:afastandmemory-efficientproteinsimilaritysearchtoolfornextgenerationsequencingdata.Bioinformatics2012,28(1):125-126</BlastOutput_reference>
<BlastOutput_db>/mreferate/dwolff/RAPSearch2.23/db/NCBI_nr_dec15</BlastOutput_db>
<BlastOutput_param>
<Parameters>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-def>gene_id_1</Iteration_query-def>
<Iteration_query-len>37</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|939543432|gb|KPV42113.1|</Hit_id>
<Hit_def>XX</Hit_def>
<Hit_accession>KPV42113.1</Hit_accession>
<Hit_len>162</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>58.151</Hsp_bit-score>
<Hsp_score>139</Hsp_score>
<Hsp_evalue>-5.6061</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>54</Hsp_hit-from>
<Hsp_hit-to>90</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>28</Hsp_identity>
<Hsp_positive>33</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>LVLCDEPVSALDVSVQAAVLNLLLEIQREHGTTMIFI</Hsp_hseq>
<Hsp_midline>+V+CDEPVSALDVSVQAAVLLL+EIQ++HTMII</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gi|385280362|gb|EIF44286.1|</Hit_id>
<Hit_def>XX</Hit_def>
<Hit_accession>EIF44286.1</Hit_accession>
<Hit_len>327</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>54.6842</Hsp_bit-score>
<Hsp_score>130</Hsp_score>
<Hsp_evalue>-4.56249</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>169</Hsp_hit-from>
<Hsp_hit-to>205</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>24</Hsp_identity>
<Hsp_positive>31</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>LVICDEPVSALDVSVQAQIINLLQELQTEHNTAMLFI</Hsp_hseq>
<Hsp_midline>+V+CDEPVSALDVSVQA++LLE+Q+HTAM+I</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>3</Hit_num>
<Hit_id>gi|550913550|ref|WP_022666548.1|</Hit_id>
<Hit_def>XX</Hit_def>
<Hit_accession>WP_022666548.1</Hit_accession>
<Hit_len>721</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>53.5286</Hsp_bit-score>
<Hsp_score>127</Hsp_score>
<Hsp_evalue>-4.21462</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>549</Hsp_hit-from>
<Hsp_hit-to>585</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>27</Hsp_identity>
<Hsp_positive>31</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>MVICDEPVSALDVSVQAAVLNLLNEIKEEMGTTMIFI</Hsp_hseq>
<Hsp_midline>MV+CDEPVSALDVSVQAAVLLLEI+++TMII</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
...
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>77704984</Statistics_db-num>
<Statistics_db-len>28292933896</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>

文件2:

peptide ABC transporter ATPase, partial [Kouleothrix aurantiaca]
oligopeptide ABC transporter [gamma proteobacterium BDW918]
ABC transporter ATP-binding protein [Desulfospira joergensenii]
输出应该是:

<?xml version="1.0"?>
<BlastOutput>
<BlastOutput_program>RAPSearch</BlastOutput_program>
<BlastOutput_version>RAPSearch2</BlastOutput_version>
<BlastOutput_reference>YonganZhao,HaixuTangandYuzhenYe.RAPSearch2:afastandmemory-efficientproteinsimilaritysearchtoolfornextgenerationsequencingdata.Bioinformatics2012,28(1):125-126</BlastOutput_reference>
<BlastOutput_db>/mreferate/dwolff/RAPSearch2.23/db/NCBI_nr_dec15</BlastOutput_db>
<BlastOutput_param>
<Parameters>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-def>gene_id_1</Iteration_query-def>
<Iteration_query-len>37</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|939543432|gb|KPV42113.1|</Hit_id>
<Hit_def>peptide ABC transporter ATPase, partial [Kouleothrix aurantiaca]</Hit_def>
<Hit_accession>KPV42113.1</Hit_accession>
<Hit_len>162</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>58.151</Hsp_bit-score>
<Hsp_score>139</Hsp_score>
<Hsp_evalue>-5.6061</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>54</Hsp_hit-from>
<Hsp_hit-to>90</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>28</Hsp_identity>
<Hsp_positive>33</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>LVLCDEPVSALDVSVQAAVLNLLLEIQREHGTTMIFI</Hsp_hseq>
<Hsp_midline>+V+CDEPVSALDVSVQAAVLLL+EIQ++HTMII</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gi|385280362|gb|EIF44286.1|</Hit_id>
<Hit_def>oligopeptide ABC transporter [gamma proteobacterium BDW918]</Hit_def>
<Hit_accession>EIF44286.1</Hit_accession>
<Hit_len>327</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>54.6842</Hsp_bit-score>
<Hsp_score>130</Hsp_score>
<Hsp_evalue>-4.56249</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>169</Hsp_hit-from>
<Hsp_hit-to>205</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>24</Hsp_identity>
<Hsp_positive>31</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>LVICDEPVSALDVSVQAQIINLLQELQTEHNTAMLFI</Hsp_hseq>
<Hsp_midline>+V+CDEPVSALDVSVQA++LLE+Q+HTAM+I</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>3</Hit_num>
<Hit_id>gi|550913550|ref|WP_022666548.1|</Hit_id>
<Hit_def>ABC transporter ATP-binding protein [Desulfospira joergensenii]</Hit_def>
<Hit_accession>WP_022666548.1</Hit_accession>
<Hit_len>721</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>53.5286</Hsp_bit-score>
<Hsp_score>127</Hsp_score>
<Hsp_evalue>-4.21462</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>37</Hsp_query-to>
<Hsp_hit-from>549</Hsp_hit-from>
<Hsp_hit-to>585</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_identity>27</Hsp_identity>
<Hsp_positive>31</Hsp_positive>
<Hsp_align-len>37</Hsp_align-len>
<Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
<Hsp_hseq>MVICDEPVSALDVSVQAAVLNLLNEIKEEMGTTMIFI</Hsp_hseq>
<Hsp_midline>MV+CDEPVSALDVSVQAAVLLLEI+++TMII</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
...
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>77704984</Statistics_db-num>
<Statistics_db-len>28292933896</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>

我第一次尝试编写脚本没有得到任何结果,可以说是一团糟。所以我希望有人可以帮助我。

2 个答案:

答案 0 :(得分:0)

我刚刚更新了脚本,以匹配上面更新后的 XML 结构。

请查看下面代码中我的注释:

    use strict;
    use warnings;
    use XML::Simple;

    # Purpose: replace the placeholder text of every <Hit_def> element in
    # 'my_xml.xml' with the next line of the description file 'file2', then
    # write the updated document back to 'my_xml.xml'.
    #
    # NOTE(review): XML::Simple builds the whole document in memory and XMLout
    # emits hash keys in sorted order, so the element order of the output
    # differs from the input. For a multi-gigabyte input a streaming approach
    # is probably required -- confirm against the real file size.

    my $xml_path  = 'my_xml.xml';
    my $defs_path = 'file2';

    # First, parse the XML into a hash.
    #   ForceArray: always represent the listed elements as array refs, even
    #               when only one of them occurs, so the loops below can
    #               iterate unconditionally.
    #   KeyAttr []: disable the default folding on 'name'/'key'/'id', which
    #               otherwise makes XMLout emit spurious <name>...</name>
    #               elements for nested hashes.
    open my $MF1, '<', $xml_path
        or die "Cannot open '$xml_path' for reading: $!";
    my $xml = XMLin(
        $MF1,
        ForceArray => [ 'Iteration', 'Hit', 'Hsp' ],
        KeyAttr    => [],
    );
    close $MF1;

    # Shape of $xml with the options above (abridged):
    #
    #   {
    #     'BlastOutput_program'    => 'RAPSearch',
    #     'BlastOutput_version'    => 'RAPSearch2',
    #     'BlastOutput_db'         => '/mreferate/dwolff/RAPSearch2.23/db/NCBI_nr_dec15',
    #     'BlastOutput_param'      => { 'Parameters' => {} },
    #     'BlastOutput_iterations' => {
    #       'Iteration' => [
    #         {
    #           'Iteration_hits' => {
    #             'Hit' => [
    #               {
    #                 'Hit_num'       => '1',
    #                 'Hit_id'        => 'gi|939543432|gb|KPV42113.1|',
    #                 'Hit_def'       => 'XX',
    #                 'Hit_accession' => 'KPV42113.1',
    #                 'Hit_len'       => '162',
    #                 'Hit_hsps'      => { 'Hsp' => [ { 'Hsp_num' => '1', ... } ] },
    #               },
    #               ...
    #             ],
    #           },
    #         },
    #       ],
    #     },
    #   }

    # Save the second file into an array, one description per element.
    open my $MF2, '<', $defs_path
        or die "Cannot open '$defs_path' for reading: $!";
    my @defs = <$MF2>;
    close $MF2;

    # Strip LF as well as CRLF line endings; a plain chomp would leave "\r"
    # behind, which then ends up inside <Hit_def>.
    s/\r?\n\z// for @defs;

    # Update the XML hash. Descriptions are consumed sequentially across all
    # iterations: indexing by Hit_num would restart at the first description
    # for every <Iteration>, because Hit_num restarts at 1 in each of them.
    my $next_def = 0;
    foreach my $iteration ( @{ $xml->{'BlastOutput_iterations'}{'Iteration'} || [] } ) {
        my $hits = $iteration->{'Iteration_hits'};
        next unless ref $hits eq 'HASH';    # iteration without any hits

        foreach my $hit ( @{ $hits->{'Hit'} || [] } ) {
            die "'$defs_path' has only " . scalar(@defs)
              . " description(s), but more <Hit> elements were found\n"
                if $next_def > $#defs;
            $hit->{'Hit_def'} = $defs[ $next_def++ ];
        }
    }

    # Write the new XML. The output goes to a temporary file first and is
    # renamed over the original afterwards, so a failure while writing does
    # not leave 'my_xml.xml' truncated.
    my $tmp_path = "$xml_path.tmp";
    open my $MF1_new, '>', $tmp_path
        or die "Cannot open '$tmp_path' for writing: $!";
    XMLout(
        $xml,
        OutputFile => $MF1_new,
        NoAttr     => 1,
        KeyAttr    => [],
        RootName   => 'BlastOutput',
        XMLDecl    => 1,
    );
    close $MF1_new
        or die "Cannot close '$tmp_path': $!";
    rename $tmp_path, $xml_path
        or die "Cannot rename '$tmp_path' to '$xml_path': $!";

答案 1 :(得分:0)

最后我弄明白了。只是输出的结构有些偏差:

<BlastOutput>
  <BlastOutput_db>/mreferate/dwolff/RAPSearch2.23/db/NCBI_nr_dec15</BlastOutput_db>
  <BlastOutput_iterations>
    <name>Iteration</name>
    <Iteration_hits>
      <Hit>
        <Hit_accession>KPV42113.1</Hit_accession>
        <Hit_def>peptide ABC transporter ATPase, partial [Kouleothrix aurantiaca]
</Hit_def>
        <Hit_hsps>
          <name>Hsp</name>
          <Hsp_align-len>37</Hsp_align-len>
          <Hsp_bit-score>58.151</Hsp_bit-score>
          <Hsp_evalue>-5.6061</Hsp_evalue>
          <Hsp_hit-from>54</Hsp_hit-from>
          <Hsp_hit-to>90</Hsp_hit-to>
          <Hsp_hseq>LVLCDEPVSALDVSVQAAVLNLLLEIQREHGTTMIFI</Hsp_hseq>
          <Hsp_identity>28</Hsp_identity>
          <Hsp_midline>+V+CDEPVSALDVSVQAAVLLL+EIQ++HTMII</Hsp_midline>
          <Hsp_num>1</Hsp_num>
          <Hsp_positive>33</Hsp_positive>
          <Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
          <Hsp_query-frame>0</Hsp_query-frame>
          <Hsp_query-from>1</Hsp_query-from>
          <Hsp_query-to>37</Hsp_query-to>
          <Hsp_score>139</Hsp_score>
        </Hit_hsps>
        <Hit_id>gi|939543432|gb|KPV42113.1|</Hit_id>
        <Hit_len>162</Hit_len>
        <Hit_num>1</Hit_num>
      </Hit>
      <Hit>
        <Hit_accession>EIF44286.1</Hit_accession>
        <Hit_def>oligopeptide ABC transporter [gamma proteobacterium BDW918]
</Hit_def>
        <Hit_hsps>
          <name>Hsp</name>
          <Hsp_align-len>37</Hsp_align-len>
          <Hsp_bit-score>54.6842</Hsp_bit-score>
          <Hsp_evalue>-4.56249</Hsp_evalue>
          <Hsp_hit-from>169</Hsp_hit-from>
          <Hsp_hit-to>205</Hsp_hit-to>
          <Hsp_hseq>LVICDEPVSALDVSVQAQIINLLQELQTEHNTAMLFI</Hsp_hseq>
          <Hsp_identity>24</Hsp_identity>
          <Hsp_midline>+V+CDEPVSALDVSVQA++LLE+Q+HTAM+I</Hsp_midline>
          <Hsp_num>1</Hsp_num>
          <Hsp_positive>31</Hsp_positive>
          <Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
          <Hsp_query-frame>0</Hsp_query-frame>
          <Hsp_query-from>1</Hsp_query-from>
          <Hsp_query-to>37</Hsp_query-to>
          <Hsp_score>130</Hsp_score>
        </Hit_hsps>
        <Hit_id>gi|385280362|gb|EIF44286.1|</Hit_id>
        <Hit_len>327</Hit_len>
        <Hit_num>2</Hit_num>
      </Hit>
      <Hit>
        <Hit_accession>WP_022666548.1</Hit_accession>
        <Hit_def>ABC transporter ATP-binding protein [Desulfospira joergensenii]
</Hit_def>
        <Hit_hsps>
          <name>Hsp</name>
          <Hsp_align-len>37</Hsp_align-len>
          <Hsp_bit-score>53.5286</Hsp_bit-score>
          <Hsp_evalue>-4.21462</Hsp_evalue>
          <Hsp_hit-from>549</Hsp_hit-from>
          <Hsp_hit-to>585</Hsp_hit-to>
          <Hsp_hseq>MVICDEPVSALDVSVQAAVLNLLNEIKEEMGTTMIFI</Hsp_hseq>
          <Hsp_identity>27</Hsp_identity>
          <Hsp_midline>MV+CDEPVSALDVSVQAAVLLLEI+++TMII</Hsp_midline>
          <Hsp_num>1</Hsp_num>
          <Hsp_positive>31</Hsp_positive>
          <Hsp_qseq>MVVCDEPVSALDVSVQAAVLTLLVEIQQQHETAMILI</Hsp_qseq>
          <Hsp_query-frame>0</Hsp_query-frame>
          <Hsp_query-from>1</Hsp_query-from>
          <Hsp_query-to>37</Hsp_query-to>
          <Hsp_score>127</Hsp_score>
        </Hit_hsps>
        <Hit_id>gi|550913550|ref|WP_022666548.1|</Hit_id>
        <Hit_len>721</Hit_len>
        <Hit_num>3</Hit_num>
      </Hit>
    </Iteration_hits>
    <Iteration_iter-num>1</Iteration_iter-num>
    <Iteration_query-def>gene_id_1</Iteration_query-def>
    <Iteration_query-len>37</Iteration_query-len>
    <Iteration_stat>
      <name>Statistics</name>
      <Statistics_db-len>28292933896</Statistics_db-len>
      <Statistics_db-num>77704984</Statistics_db-num>
      <Statistics_eff-space>0</Statistics_eff-space>
      <Statistics_entropy>0.14</Statistics_entropy>
      <Statistics_hsp-len>0</Statistics_hsp-len>
      <Statistics_kappa>0.041</Statistics_kappa>
      <Statistics_lambda>0.267</Statistics_lambda>
    </Iteration_stat>
  </BlastOutput_iterations>
  <BlastOutput_param>
    <name>Parameters</name>
  </BlastOutput_param>
  <BlastOutput_program>RAPSearch</BlastOutput_program>
  <BlastOutput_reference>YonganZhao,HaixuTangandYuzhenYe.RAPSearch2:afastandmemory-efficientproteinsimilaritysearchtoolfornextgenerationsequencingdata.Bioinformatics2012,28(1):125-126</BlastOutput_reference>
  <BlastOutput_version>RAPSearch2</BlastOutput_version>
</BlastOutput>