从BLAST XML中过滤重复项

时间:2016-07-02 22:48:52

标签: xml xslt

我想使用字段" Hsp_query-from"从长BLAST输出文件中过滤掉重复的条目。即我的XML输出文件是根据这个字段排序的,我想只选择每个唯一的第一个条目" Hsp_query-from"值。此外,这应该为" Hsp_num" 1,并且分别用于" Hsp_num" 2.我的示例输入文件如下所示:

   <?xml version="1.0" encoding="UTF-8" ?>
   <!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
    <BlastOutput>
      <BlastOutput_program>blastn</BlastOutput_program>
    <BlastOutput_version>BLASTN 2.3.0+</BlastOutput_version>
      <BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>
      <BlastOutput_db>ABC</BlastOutput_db>
      <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
      <BlastOutput_query-def>m151221</BlastOutput_query-def>
      <BlastOutput_query-len>1790</BlastOutput_query-len>
      <BlastOutput_param>
        <Parameters>
          <Parameters_expect>0.001</Parameters_expect>
          <Parameters_sc-match>1</Parameters_sc-match>
          <Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
          <Parameters_gap-open>0</Parameters_gap-open>
          <Parameters_gap-extend>0</Parameters_gap-extend>
          <Parameters_filter>L;m;</Parameters_filter>
        </Parameters>
      </BlastOutput_param>
      <BlastOutput_iterations>
        <Iteration>
          <Iteration_iter-num>1</Iteration_iter-num>
          <Iteration_query-ID>Query_1</Iteration_query-ID>
          <Iteration_query-def>m151221</Iteration_query-def>
          <Iteration_query-len>1790</Iteration_query-len>
          <Iteration_hits>
            <Hit>
              <Hit_num>14</Hit_num>
              <Hit_id>A1</Hit_id>
              <Hit_def>A1-def</Hit_def>
              <Hit_accession>A1</Hit_accession>
              <Hit_len>249</Hit_len>
              <Hit_hsps>
                <Hsp>
                  <Hsp_num>1</Hsp_num>
                  <Hsp_bit-score>130.386</Hsp_bit-score>
                  <Hsp_score>70</Hsp_score>
                  <Hsp_evalue>5.24249e-32</Hsp_evalue>
                  <Hsp_query-from>1</Hsp_query-from>
                  <Hsp_query-to>73</Hsp_query-to>
                  <Hsp_hit-from>74</Hsp_hit-from>
                  <Hsp_hit-to>1</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>73</Hsp_identity>
                  <Hsp_positive>73</Hsp_positive>
                  <Hsp_gaps>1</Hsp_gaps>
                  <Hsp_align-len>74</Hsp_align-len>
                  <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
                  <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
                  <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
                </Hsp>
              </Hit_hsps>
            </Hit>
            <Hit>
              <Hit_num>15</Hit_num>
              <Hit_id>D1</Hit_id>
              <Hit_def>D1-def</Hit_def>
              <Hit_accession>D1</Hit_accession>
              <Hit_len>261</Hit_len>
              <Hit_hsps>
                <Hsp>
                  <Hsp_num>1</Hsp_num>
                  <Hsp_bit-score>130.386</Hsp_bit-score>
                  <Hsp_score>70</Hsp_score>
                  <Hsp_evalue>5.24249e-32</Hsp_evalue>
                  <Hsp_query-from>1</Hsp_query-from>
                  <Hsp_query-to>73</Hsp_query-to>
                  <Hsp_hit-from>80</Hsp_hit-from>
                  <Hsp_hit-to>7</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>73</Hsp_identity>
                  <Hsp_positive>73</Hsp_positive>
                  <Hsp_gaps>1</Hsp_gaps>
                  <Hsp_align-len>74</Hsp_align-len>
                  <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
                  <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
                  <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
                </Hsp>
              </Hit_hsps>
            </Hit>
            <Hit>
              <Hit_num>16</Hit_num>
              <Hit_id>B1</Hit_id>
              <Hit_def>B1-def</Hit_def>
              <Hit_accession>B1</Hit_accession>
              <Hit_len>253</Hit_len>
              <Hit_hsps>
                <Hsp>
                  <Hsp_num>1</Hsp_num>
                  <Hsp_bit-score>130.386</Hsp_bit-score>
                  <Hsp_score>70</Hsp_score>
                  <Hsp_evalue>5.24249e-32</Hsp_evalue>
                  <Hsp_query-from>1</Hsp_query-from>
                  <Hsp_query-to>73</Hsp_query-to>
                  <Hsp_hit-from>74</Hsp_hit-from>
                  <Hsp_hit-to>1</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>73</Hsp_identity>
                  <Hsp_positive>73</Hsp_positive>
                  <Hsp_gaps>1</Hsp_gaps>
                  <Hsp_align-len>74</Hsp_align-len>
                  <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
                  <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
                  <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
                </Hsp>
                <Hsp>
                  <Hsp_num>2</Hsp_num>
                  <Hsp_bit-score>71.293</Hsp_bit-score>
                  <Hsp_score>38</Hsp_score>
                  <Hsp_evalue>3.22284e-14</Hsp_evalue>
                  <Hsp_query-from>1735</Hsp_query-from>
                  <Hsp_query-to>1783</Hsp_query-to>
                  <Hsp_hit-from>233</Hsp_hit-from>
                  <Hsp_hit-to>188</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>46</Hsp_identity>
                  <Hsp_positive>46</Hsp_positive>
                  <Hsp_gaps>3</Hsp_gaps>
                  <Hsp_align-len>49</Hsp_align-len>
                  <Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
                  <Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
                  <Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
                </Hsp>
              </Hit_hsps>
            </Hit>
            <Hit>
              <Hit_num>17</Hit_num>
              <Hit_id>E1</Hit_id>
              <Hit_def>E1-def</Hit_def>
              <Hit_accession>E1</Hit_accession>
              <Hit_len>267</Hit_len>
              <Hit_hsps>
                <Hsp>
                  <Hsp_num>1</Hsp_num>
                  <Hsp_bit-score>130.386</Hsp_bit-score>
                  <Hsp_score>70</Hsp_score>
                  <Hsp_evalue>5.24249e-32</Hsp_evalue>
                  <Hsp_query-from>1</Hsp_query-from>
                  <Hsp_query-to>73</Hsp_query-to>
                  <Hsp_hit-from>81</Hsp_hit-from>
                  <Hsp_hit-to>8</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>73</Hsp_identity>
                  <Hsp_positive>73</Hsp_positive>
                  <Hsp_gaps>1</Hsp_gaps>
                  <Hsp_align-len>74</Hsp_align-len>
                  <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
                  <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
                  <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
                </Hsp>
                <Hsp>
                  <Hsp_num>2</Hsp_num>
                  <Hsp_bit-score>71.293</Hsp_bit-score>
                  <Hsp_score>38</Hsp_score>
                  <Hsp_evalue>3.22284e-14</Hsp_evalue>
                  <Hsp_query-from>1735</Hsp_query-from>
                  <Hsp_query-to>1783</Hsp_query-to>
                  <Hsp_hit-from>240</Hsp_hit-from>
                  <Hsp_hit-to>195</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>46</Hsp_identity>
                  <Hsp_positive>46</Hsp_positive>
                  <Hsp_gaps>3</Hsp_gaps>
                  <Hsp_align-len>49</Hsp_align-len>
                  <Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
                  <Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
                  <Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
                </Hsp>
              </Hit_hsps>
            </Hit>
            <Hit>
              <Hit_num>18</Hit_num>
              <Hit_id>F1</Hit_id>
              <Hit_def>F1-def</Hit_def>
              <Hit_accession>F1</Hit_accession>
              <Hit_len>274</Hit_len>
              <Hit_hsps>
                <Hsp>
                  <Hsp_num>1</Hsp_num>
                  <Hsp_bit-score>130.386</Hsp_bit-score>
                  <Hsp_score>70</Hsp_score>
                  <Hsp_evalue>5.24249e-32</Hsp_evalue>
                  <Hsp_query-from>1</Hsp_query-from>
                  <Hsp_query-to>73</Hsp_query-to>
                  <Hsp_hit-from>87</Hsp_hit-from>
                  <Hsp_hit-to>14</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>73</Hsp_identity>
                  <Hsp_positive>73</Hsp_positive>
                  <Hsp_gaps>1</Hsp_gaps>
                  <Hsp_align-len>74</Hsp_align-len>
                  <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
                  <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
                  <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
                </Hsp>
                <Hsp>
                  <Hsp_num>2</Hsp_num>
                  <Hsp_bit-score>71.293</Hsp_bit-score>
                  <Hsp_score>38</Hsp_score>
                  <Hsp_evalue>3.22284e-14</Hsp_evalue>
                  <Hsp_query-from>1735</Hsp_query-from>
                  <Hsp_query-to>1783</Hsp_query-to>
                  <Hsp_hit-from>246</Hsp_hit-from>
                  <Hsp_hit-to>201</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>46</Hsp_identity>
                  <Hsp_positive>46</Hsp_positive>
                  <Hsp_gaps>3</Hsp_gaps>
                  <Hsp_align-len>49</Hsp_align-len>
                  <Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
                  <Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
                  <Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
                </Hsp>
              </Hit_hsps>
            </Hit>
            <Hit>
              <Hit_num>19</Hit_num>
              <Hit_id>G1</Hit_id>
              <Hit_def>G1-def</Hit_def>
              <Hit_accession>G1</Hit_accession>
              <Hit_len>267</Hit_len>
              <Hit_hsps>
                <Hsp>
                  <Hsp_num>1</Hsp_num>
                  <Hsp_bit-score>130.386</Hsp_bit-score>
                  <Hsp_score>70</Hsp_score>
                  <Hsp_evalue>5.24249e-32</Hsp_evalue>
                  <Hsp_query-from>1</Hsp_query-from>
                  <Hsp_query-to>73</Hsp_query-to>
                  <Hsp_hit-from>80</Hsp_hit-from>
                  <Hsp_hit-to>7</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>73</Hsp_identity>
                  <Hsp_positive>73</Hsp_positive>
                  <Hsp_gaps>1</Hsp_gaps>
                  <Hsp_align-len>74</Hsp_align-len>
                  <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
                  <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
                  <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
                </Hsp>
                <Hsp>
                  <Hsp_num>2</Hsp_num>
                  <Hsp_bit-score>71.293</Hsp_bit-score>
                  <Hsp_score>38</Hsp_score>
                  <Hsp_evalue>3.22284e-14</Hsp_evalue>
                  <Hsp_query-from>1735</Hsp_query-from>
                  <Hsp_query-to>1783</Hsp_query-to>
                  <Hsp_hit-from>239</Hsp_hit-from>
                  <Hsp_hit-to>194</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>46</Hsp_identity>
                  <Hsp_positive>46</Hsp_positive>
                  <Hsp_gaps>3</Hsp_gaps>
                  <Hsp_align-len>49</Hsp_align-len>
                  <Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
                  <Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
                  <Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
                </Hsp>
              </Hit_hsps>
            </Hit>
            <Hit>
              <Hit_num>1</Hit_num>
              <Hit_id>C1</Hit_id>
              <Hit_def>C1-def</Hit_def>
              <Hit_accession>C1</Hit_accession>
              <Hit_len>568</Hit_len>
              <Hit_hsps>
                <Hsp>
                  <Hsp_num>1</Hsp_num>
                  <Hsp_bit-score>1037.09</Hsp_bit-score>
                  <Hsp_score>561</Hsp_score>
                  <Hsp_evalue>0</Hsp_evalue>
                  <Hsp_query-from>74</Hsp_query-from>
                  <Hsp_query-to>639</Hsp_query-to>
                  <Hsp_hit-from>568</Hsp_hit-from>
                  <Hsp_hit-to>1</Hsp_hit-to>
                  <Hsp_query-frame>1</Hsp_query-frame>
                  <Hsp_hit-frame>-1</Hsp_hit-frame>
                  <Hsp_identity>566</Hsp_identity>
                  <Hsp_positive>566</Hsp_positive>
                  <Hsp_gaps>2</Hsp_gaps>
                  <Hsp_align-len>568</Hsp_align-len>
                  <Hsp_qseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAA-CCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCC-AAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_qseq>
                  <Hsp_hseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAAACCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCCCAAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_hseq>
                  <Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
                </Hsp>
              </Hit_hsps>
            </Hit>
          </Iteration_hits>
          <Iteration_stat>
            <Statistics>
              <Statistics_db-num>78</Statistics_db-num>
              <Statistics_db-len>54018</Statistics_db-len>
              <Statistics_hsp-len>18</Statistics_hsp-len>
              <Statistics_eff-space>93232008</Statistics_eff-space>
              <Statistics_kappa>0.46</Statistics_kappa>
              <Statistics_lambda>1.28</Statistics_lambda>
              <Statistics_entropy>0.85</Statistics_entropy>
            </Statistics>
          </Iteration_stat>
        </Iteration>
      </BlastOutput_iterations>
    </BlastOutput>

结果输出应为:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
  <BlastOutput_program>blastn</BlastOutput_program>
  <BlastOutput_version>BLASTN 2.3.0+</BlastOutput_version>
  <BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>
  <BlastOutput_db>ABC</BlastOutput_db>
  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
  <BlastOutput_query-def>m151221</BlastOutput_query-def>
  <BlastOutput_query-len>1790</BlastOutput_query-len>
  <BlastOutput_param>
    <Parameters>
      <Parameters_expect>0.001</Parameters_expect>
      <Parameters_sc-match>1</Parameters_sc-match>
      <Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
      <Parameters_gap-open>0</Parameters_gap-open>
      <Parameters_gap-extend>0</Parameters_gap-extend>
      <Parameters_filter>L;m;</Parameters_filter>
    </Parameters>
  </BlastOutput_param>
  <BlastOutput_iterations>
    <Iteration>
      <Iteration_iter-num>1</Iteration_iter-num>
      <Iteration_query-ID>Query_1</Iteration_query-ID>
      <Iteration_query-def>m151221</Iteration_query-def>
      <Iteration_query-len>1790</Iteration_query-len>
      <Iteration_hits>
        <Hit>
          <Hit_num>14</Hit_num>
          <Hit_id>A1</Hit_id>
          <Hit_def>A1-def</Hit_def>
          <Hit_accession>A1</Hit_accession>
          <Hit_len>249</Hit_len>
          <Hit_hsps>
            <Hsp>
              <Hsp_num>1</Hsp_num>
              <Hsp_bit-score>130.386</Hsp_bit-score>
              <Hsp_score>70</Hsp_score>
              <Hsp_evalue>5.24249e-32</Hsp_evalue>
              <Hsp_query-from>1</Hsp_query-from>
              <Hsp_query-to>73</Hsp_query-to>
              <Hsp_hit-from>74</Hsp_hit-from>
              <Hsp_hit-to>1</Hsp_hit-to>
              <Hsp_query-frame>1</Hsp_query-frame>
              <Hsp_hit-frame>-1</Hsp_hit-frame>
              <Hsp_identity>73</Hsp_identity>
              <Hsp_positive>73</Hsp_positive>
              <Hsp_gaps>1</Hsp_gaps>
              <Hsp_align-len>74</Hsp_align-len>
              <Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
              <Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
              <Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
            </Hsp>
          </Hit_hsps>
        </Hit>
        <Hit>
          <Hit_num>16</Hit_num>
          <Hit_id>B1</Hit_id>
          <Hit_def>B1-def</Hit_def>
          <Hit_accession>B1</Hit_accession>
          <Hit_len>253</Hit_len>
          <Hit_hsps>
            <Hsp>
              <Hsp_num>2</Hsp_num>
              <Hsp_bit-score>71.293</Hsp_bit-score>
              <Hsp_score>38</Hsp_score>
              <Hsp_evalue>3.22284e-14</Hsp_evalue>
              <Hsp_query-from>1735</Hsp_query-from>
              <Hsp_query-to>1783</Hsp_query-to>
              <Hsp_hit-from>233</Hsp_hit-from>
              <Hsp_hit-to>188</Hsp_hit-to>
              <Hsp_query-frame>1</Hsp_query-frame>
              <Hsp_hit-frame>-1</Hsp_hit-frame>
              <Hsp_identity>46</Hsp_identity>
              <Hsp_positive>46</Hsp_positive>
              <Hsp_gaps>3</Hsp_gaps>
              <Hsp_align-len>49</Hsp_align-len>
              <Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
              <Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
              <Hsp_midline>|||||||||||||||||||||||||||||||||  || |||||||||||</Hsp_midline>
            </Hsp>
          </Hit_hsps>
        </Hit>
        <Hit>
          <Hit_num>1</Hit_num>
          <Hit_id>C1</Hit_id>
          <Hit_def>C1-def</Hit_def>
          <Hit_accession>C1</Hit_accession>
          <Hit_len>568</Hit_len>
          <Hit_hsps>
            <Hsp>
              <Hsp_num>1</Hsp_num>
              <Hsp_bit-score>1037.09</Hsp_bit-score>
              <Hsp_score>561</Hsp_score>
              <Hsp_evalue>0</Hsp_evalue>
              <Hsp_query-from>74</Hsp_query-from>
              <Hsp_query-to>639</Hsp_query-to>
              <Hsp_hit-from>568</Hsp_hit-from>
              <Hsp_hit-to>1</Hsp_hit-to>
              <Hsp_query-frame>1</Hsp_query-frame>
              <Hsp_hit-frame>-1</Hsp_hit-frame>
              <Hsp_identity>566</Hsp_identity>
              <Hsp_positive>566</Hsp_positive>
              <Hsp_gaps>2</Hsp_gaps>
              <Hsp_align-len>568</Hsp_align-len>
              <Hsp_qseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAA-CCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCC-AAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_qseq>
              <Hsp_hseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAAACCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCCCAAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_hseq>
              <Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
            </Hsp>
          </Hit_hsps>
        </Hit>
       </Iteration_hits>
      <Iteration_stat>
        <Statistics>
          <Statistics_db-num>78</Statistics_db-num>
          <Statistics_db-len>54018</Statistics_db-len>
          <Statistics_hsp-len>18</Statistics_hsp-len>
          <Statistics_eff-space>93232008</Statistics_eff-space>
          <Statistics_kappa>0.46</Statistics_kappa>
          <Statistics_lambda>1.28</Statistics_lambda>
          <Statistics_entropy>0.85</Statistics_entropy>
        </Statistics>
      </Iteration_stat>
    </Iteration>
  </BlastOutput_iterations>
</BlastOutput>

请帮助改进以下代码:

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Removes duplicate HSPs from a BLAST XML report.

  For every distinct (Hsp_num, Hsp_query-from) pair only the first <Hsp> in
  document order is kept; a <Hit> that is left without any <Hsp> is removed
  as well. All other nodes are copied through unchanged by the identity
  template. Grouping is document-wide, i.e. it is not restarted for each
  <Iteration>.
-->
<xsl:stylesheet
 version="1.0"
 xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
 >

<xsl:strip-space elements="*"/>

<xsl:output method="xml" encoding="UTF-8" indent="yes" doctype-public="-//NCBI//NCBI BlastOutput/EN" doctype-system="http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"/>

<!--
  Index of every <Hsp>, keyed by Hsp_num and Hsp_query-from together so that
  HSP number 1 and HSP number 2 are de-duplicated independently. The '|'
  separator keeps pairs such as (1, 12) and (11, 2) from producing the same
  key string.
-->
<xsl:key name="TOP_query_from" match="Iteration_hits/Hit/Hit_hsps/Hsp" use="concat(Hsp_num, '|', Hsp_query-from)"/>

<!-- Identity template: copies every node that no other template handles. -->
<xsl:template match="@*|node()">
 <xsl:copy>
    <xsl:apply-templates select="@*|node()"/>
 </xsl:copy>
</xsl:template>

<!--
  Suppress every <Hsp> that is not the first member of its key group.
  The lookup runs with the <Hsp> itself as context node, so Hsp_num and
  Hsp_query-from resolve to its own children.
-->
<xsl:template match="Iteration_hits/Hit/Hit_hsps/Hsp[generate-id() != generate-id(key('TOP_query_from', concat(Hsp_num, '|', Hsp_query-from))[1])]"/>

<!--
  Suppress every <Hit> in which no <Hsp> survives the filter above, so the
  output never contains a <Hit> with an empty <Hit_hsps>.
-->
<xsl:template match="Iteration_hits/Hit[not(Hit_hsps/Hsp[generate-id() = generate-id(key('TOP_query_from', concat(Hsp_num, '|', Hsp_query-from))[1])])]"/>

</xsl:stylesheet>

1 个答案:

答案 0 :(得分:0)

考虑为密钥连接<Iteration_query-ID><Hsp_num><Hsp_query-from>。虽然身份转换可以简洁,但对于XML的本质,构建架构的方式可能更长,可能是一个可行的解决方案。 Identity Transform强制您删除不需要的节点,并且您还有另外一个挑战:保留其中一个<Hsp>兄弟而不是另一个:

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Filters duplicate HSPs out of a BLAST XML report by rebuilding the
  document level by level.

  Within one query (Iteration_query-ID) only the first <Hsp> in document
  order is kept for each (Hsp_num, Hsp_query-from) pair. A <Hit> is emitted
  only when at least one of its <Hsp> children survives. Everything outside
  the BlastOutput_iterations / Iteration_hits / Hit_hsps chain is copied
  verbatim.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" encoding="UTF-8" indent="yes"
     doctype-public="-//NCBI//NCBI BlastOutput/EN"
     doctype-system="http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"/>
<xsl:strip-space elements="*"/>

 <!--
   Grouping key for Muenchian de-duplication. The fields are joined with '|'
   so that different tuples cannot collapse into one string (without a
   separator, Hsp_num 1 + query-from 1735 and Hsp_num 11 + query-from 735
   would both end in "11735"). The two numeric fields come first, which
   keeps the key unambiguous even if the query ID itself contains '|'.
 -->
 <xsl:key name="hspkey" match="Hsp"
      use="concat(Hsp_num, '|', Hsp_query-from, '|',
                  ancestor::Iteration/Iteration_query-ID)"/>

 <!--
   Root element: copy the header elements, filter the iterations, then copy
   whatever follows them. Siblings are selected relative to
   BlastOutput_iterations rather than by position, so optional trailing
   elements are neither dropped nor cause the iterations to be copied twice.
 -->
 <xsl:template match="BlastOutput">
  <xsl:copy>
   <xsl:copy-of select="*[not(self::BlastOutput_iterations)
                          and not(preceding-sibling::BlastOutput_iterations)]"/>
   <xsl:apply-templates select="BlastOutput_iterations"/>
   <xsl:copy-of select="*[not(self::BlastOutput_iterations)
                          and preceding-sibling::BlastOutput_iterations]"/>
  </xsl:copy>
 </xsl:template>

 <!-- Container of all iterations: process each <Iteration> in turn. -->
 <xsl:template match="BlastOutput_iterations">
  <xsl:copy>
   <xsl:apply-templates select="Iteration"/>
  </xsl:copy>
 </xsl:template>

 <!--
   One query iteration: copy the elements before Iteration_hits, filter the
   hits, then copy the elements after it (statistics and, when present, any
   further trailing element). Works whether or not Iteration_stat exists.
 -->
 <xsl:template match="Iteration">
  <xsl:copy>
   <xsl:copy-of select="*[not(self::Iteration_hits)
                          and not(preceding-sibling::Iteration_hits)]"/>
   <xsl:apply-templates select="Iteration_hits"/>
   <xsl:copy-of select="*[not(self::Iteration_hits)
                          and preceding-sibling::Iteration_hits]"/>
  </xsl:copy>
 </xsl:template>

 <!--
   Hit list: keep only hits that own at least one <Hsp> which is the first
   member of its key group; all other hits would end up empty.
 -->
 <xsl:template match="Iteration_hits">
  <xsl:copy>
   <xsl:apply-templates select="Hit[descendant::Hsp[generate-id(.) =
                generate-id(key('hspkey', concat(Hsp_num, '|', Hsp_query-from, '|',
                            ancestor::Iteration/Iteration_query-ID))[1])]]"/>
  </xsl:copy>
 </xsl:template>

 <!--
   Single hit: copy its descriptive elements unchanged and filter Hit_hsps.
   Siblings are again selected by name, not by position.
 -->
 <xsl:template match="Hit">
  <xsl:copy>
   <xsl:copy-of select="*[not(self::Hit_hsps)
                          and not(preceding-sibling::Hit_hsps)]"/>
   <xsl:apply-templates select="Hit_hsps"/>
   <xsl:copy-of select="*[not(self::Hit_hsps)
                          and preceding-sibling::Hit_hsps]"/>
  </xsl:copy>
 </xsl:template>

 <!-- HSP list: copy only the <Hsp> elements that are first in their group. -->
 <xsl:template match="Hit_hsps">
  <xsl:copy>
   <xsl:copy-of select="Hsp[generate-id(.) =
                generate-id(key('hspkey', concat(Hsp_num, '|', Hsp_query-from, '|',
                            ancestor::Iteration/Iteration_query-ID))[1])]"/>
  </xsl:copy>
 </xsl:template>

</xsl:stylesheet>