我想根据字段 "Hsp_query-from" 从一个很长的 BLAST 输出文件中过滤掉重复的条目。我的 XML 输出文件已按该字段排序,我只想保留每个唯一 "Hsp_query-from" 值对应的第一个条目。此外,去重应分别针对 "Hsp_num" 为 1 的条目和 "Hsp_num" 为 2 的条目进行。我的示例输入文件如下所示:
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastn</BlastOutput_program>
<BlastOutput_version>BLASTN 2.3.0+</BlastOutput_version>
<BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>
<BlastOutput_db>ABC</BlastOutput_db>
<BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
<BlastOutput_query-def>m151221</BlastOutput_query-def>
<BlastOutput_query-len>1790</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_expect>0.001</Parameters_expect>
<Parameters_sc-match>1</Parameters_sc-match>
<Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
<Parameters_gap-open>0</Parameters_gap-open>
<Parameters_gap-extend>0</Parameters_gap-extend>
<Parameters_filter>L;m;</Parameters_filter>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>Query_1</Iteration_query-ID>
<Iteration_query-def>m151221</Iteration_query-def>
<Iteration_query-len>1790</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>14</Hit_num>
<Hit_id>A1</Hit_id>
<Hit_def>A1-def</Hit_def>
<Hit_accession>A1</Hit_accession>
<Hit_len>249</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>74</Hsp_hit-from>
<Hsp_hit-to>1</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>15</Hit_num>
<Hit_id>D1</Hit_id>
<Hit_def>D1-def</Hit_def>
<Hit_accession>D1</Hit_accession>
<Hit_len>261</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>80</Hsp_hit-from>
<Hsp_hit-to>7</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>16</Hit_num>
<Hit_id>B1</Hit_id>
<Hit_def>B1-def</Hit_def>
<Hit_accession>B1</Hit_accession>
<Hit_len>253</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>74</Hsp_hit-from>
<Hsp_hit-to>1</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
<Hsp>
<Hsp_num>2</Hsp_num>
<Hsp_bit-score>71.293</Hsp_bit-score>
<Hsp_score>38</Hsp_score>
<Hsp_evalue>3.22284e-14</Hsp_evalue>
<Hsp_query-from>1735</Hsp_query-from>
<Hsp_query-to>1783</Hsp_query-to>
<Hsp_hit-from>233</Hsp_hit-from>
<Hsp_hit-to>188</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>46</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>3</Hsp_gaps>
<Hsp_align-len>49</Hsp_align-len>
<Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
<Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>17</Hit_num>
<Hit_id>E1</Hit_id>
<Hit_def>E1-def</Hit_def>
<Hit_accession>E1</Hit_accession>
<Hit_len>267</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>81</Hsp_hit-from>
<Hsp_hit-to>8</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
<Hsp>
<Hsp_num>2</Hsp_num>
<Hsp_bit-score>71.293</Hsp_bit-score>
<Hsp_score>38</Hsp_score>
<Hsp_evalue>3.22284e-14</Hsp_evalue>
<Hsp_query-from>1735</Hsp_query-from>
<Hsp_query-to>1783</Hsp_query-to>
<Hsp_hit-from>240</Hsp_hit-from>
<Hsp_hit-to>195</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>46</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>3</Hsp_gaps>
<Hsp_align-len>49</Hsp_align-len>
<Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
<Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>18</Hit_num>
<Hit_id>F1</Hit_id>
<Hit_def>F1-def</Hit_def>
<Hit_accession>F1</Hit_accession>
<Hit_len>274</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>87</Hsp_hit-from>
<Hsp_hit-to>14</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
<Hsp>
<Hsp_num>2</Hsp_num>
<Hsp_bit-score>71.293</Hsp_bit-score>
<Hsp_score>38</Hsp_score>
<Hsp_evalue>3.22284e-14</Hsp_evalue>
<Hsp_query-from>1735</Hsp_query-from>
<Hsp_query-to>1783</Hsp_query-to>
<Hsp_hit-from>246</Hsp_hit-from>
<Hsp_hit-to>201</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>46</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>3</Hsp_gaps>
<Hsp_align-len>49</Hsp_align-len>
<Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
<Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>19</Hit_num>
<Hit_id>G1</Hit_id>
<Hit_def>G1-def</Hit_def>
<Hit_accession>G1</Hit_accession>
<Hit_len>267</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>80</Hsp_hit-from>
<Hsp_hit-to>7</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
<Hsp>
<Hsp_num>2</Hsp_num>
<Hsp_bit-score>71.293</Hsp_bit-score>
<Hsp_score>38</Hsp_score>
<Hsp_evalue>3.22284e-14</Hsp_evalue>
<Hsp_query-from>1735</Hsp_query-from>
<Hsp_query-to>1783</Hsp_query-to>
<Hsp_hit-from>239</Hsp_hit-from>
<Hsp_hit-to>194</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>46</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>3</Hsp_gaps>
<Hsp_align-len>49</Hsp_align-len>
<Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
<Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>C1</Hit_id>
<Hit_def>C1-def</Hit_def>
<Hit_accession>C1</Hit_accession>
<Hit_len>568</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1037.09</Hsp_bit-score>
<Hsp_score>561</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>74</Hsp_query-from>
<Hsp_query-to>639</Hsp_query-to>
<Hsp_hit-from>568</Hsp_hit-from>
<Hsp_hit-to>1</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>566</Hsp_identity>
<Hsp_positive>566</Hsp_positive>
<Hsp_gaps>2</Hsp_gaps>
<Hsp_align-len>568</Hsp_align-len>
<Hsp_qseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAA-CCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCC-AAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_qseq>
<Hsp_hseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAAACCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCCCAAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>78</Statistics_db-num>
<Statistics_db-len>54018</Statistics_db-len>
<Statistics_hsp-len>18</Statistics_hsp-len>
<Statistics_eff-space>93232008</Statistics_eff-space>
<Statistics_kappa>0.46</Statistics_kappa>
<Statistics_lambda>1.28</Statistics_lambda>
<Statistics_entropy>0.85</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>
结果输出应为:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastn</BlastOutput_program>
<BlastOutput_version>BLASTN 2.3.0+</BlastOutput_version>
<BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>
<BlastOutput_db>ABC</BlastOutput_db>
<BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
<BlastOutput_query-def>m151221</BlastOutput_query-def>
<BlastOutput_query-len>1790</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_expect>0.001</Parameters_expect>
<Parameters_sc-match>1</Parameters_sc-match>
<Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
<Parameters_gap-open>0</Parameters_gap-open>
<Parameters_gap-extend>0</Parameters_gap-extend>
<Parameters_filter>L;m;</Parameters_filter>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>Query_1</Iteration_query-ID>
<Iteration_query-def>m151221</Iteration_query-def>
<Iteration_query-len>1790</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>14</Hit_num>
<Hit_id>A1</Hit_id>
<Hit_def>A1-def</Hit_def>
<Hit_accession>A1</Hit_accession>
<Hit_len>249</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>130.386</Hsp_bit-score>
<Hsp_score>70</Hsp_score>
<Hsp_evalue>5.24249e-32</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>73</Hsp_query-to>
<Hsp_hit-from>74</Hsp_hit-from>
<Hsp_hit-to>1</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>73</Hsp_identity>
<Hsp_positive>73</Hsp_positive>
<Hsp_gaps>1</Hsp_gaps>
<Hsp_align-len>74</Hsp_align-len>
<Hsp_qseq>TATATGATAATCATCGCAAGACCGGCAACAGGAT-CAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_qseq>
<Hsp_hseq>TATATGATAATCATCGCAAGACCGGCAACAGGATTCAATCTTAAGAAACTTTATTGCCAAATGTTTGAACGATC</Hsp_hseq>
<Hsp_midline>|||||||||||||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>16</Hit_num>
<Hit_id>B1</Hit_id>
<Hit_def>B1-def</Hit_def>
<Hit_accession>B1</Hit_accession>
<Hit_len>253</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>2</Hsp_num>
<Hsp_bit-score>71.293</Hsp_bit-score>
<Hsp_score>38</Hsp_score>
<Hsp_evalue>3.22284e-14</Hsp_evalue>
<Hsp_query-from>1735</Hsp_query-from>
<Hsp_query-to>1783</Hsp_query-to>
<Hsp_hit-from>233</Hsp_hit-from>
<Hsp_hit-to>188</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>46</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>3</Hsp_gaps>
<Hsp_align-len>49</Hsp_align-len>
<Hsp_qseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGCGCTAATATTTTGTTTT</Hsp_qseq>
<Hsp_hseq>ACCGCGCGCGATAATTTATCCTAGTTTGCGCGC--TA-TATTTTGTTTT</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||| || |||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>C1</Hit_id>
<Hit_def>C1-def</Hit_def>
<Hit_accession>C1</Hit_accession>
<Hit_len>568</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>1037.09</Hsp_bit-score>
<Hsp_score>561</Hsp_score>
<Hsp_evalue>0</Hsp_evalue>
<Hsp_query-from>74</Hsp_query-from>
<Hsp_query-to>639</Hsp_query-to>
<Hsp_hit-from>568</Hsp_hit-from>
<Hsp_hit-to>1</Hsp_hit-to>
<Hsp_query-frame>1</Hsp_query-frame>
<Hsp_hit-frame>-1</Hsp_hit-frame>
<Hsp_identity>566</Hsp_identity>
<Hsp_positive>566</Hsp_positive>
<Hsp_gaps>2</Hsp_gaps>
<Hsp_align-len>568</Hsp_align-len>
<Hsp_qseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAA-CCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCC-AAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_qseq>
<Hsp_hseq>TGCAGGTCGACTCAGATCTGGGTAACTGGCCTAACTGGCCTTGGAGGAGCTGGCAACTCAAAATCCCTTTGCCAAAAACCAACATCATGCCATCCACCATGCTTGTATCCAGCTGCGCGCAATGTACCCCGGGCTGTGTATCCCAAAGCCTCATGCAACCTAACAGATGGATCGTTTGGAAGGCCTATAACAGCAACCACAGACTTAAAACCTTGCGCCTCCATAGACTTAAGCAAATGTGTGTACAATGTGGATCCTAGGCCCAACCTTTGATGCCTATGTGACACGTAAACAGTACTCTCAACTGTCCAATCGTAAGCGTTCCTAGCCTTCCAGGGCCCAGCGTAAGCAATACCAGCCACAACACCCTCAACCTCAGCAACCAACCAAGGGTATCTATCTTGCAACCTCTCTAGATCATCAATCCACTCTTGTGGTGTTTGTGGCTCTGTCCTAAAGTTCACTGTAGACGTCTCAATGTAATGGTTAACGATATCACAAACCGCGGCCATATCAGCTGCTGTAGCTGGCCTAATCTCAACTGGTCTCCTCTCCGGAGACATGTCGA</Hsp_hseq>
<Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>78</Statistics_db-num>
<Statistics_db-len>54018</Statistics_db-len>
<Statistics_hsp-len>18</Statistics_hsp-len>
<Statistics_eff-space>93232008</Statistics_eff-space>
<Statistics_kappa>0.46</Statistics_kappa>
<Statistics_lambda>1.28</Statistics_lambda>
<Statistics_entropy>0.85</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>
请帮助改进以下代码:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Filters a BLAST XML report (BlastOutput) so that, inside each Iteration,
  only the first Hsp in document order is kept for every distinct pair of
  (Hsp_num, Hsp_query-from). A Hit that loses all of its Hsp elements is
  removed as well; every other node is copied through unchanged.
-->
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <xsl:output method="xml" encoding="UTF-8" indent="yes"
              doctype-public="-//NCBI//NCBI BlastOutput/EN"
              doctype-system="http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"/>
  <xsl:strip-space elements="*"/>

  <!--
    Groups Hsp elements by owning Iteration, Hsp_num and Hsp_query-from.
    generate-id() of the Iteration keeps groups from different iterations
    apart, and the '|' delimiter prevents two different value pairs from
    concatenating to the same string (e.g. "1"+"12" versus "11"+"2").
  -->
  <xsl:key name="TOP_query_from"
           match="Iteration_hits/Hit/Hit_hsps/Hsp"
           use="concat(generate-id(ancestor::Iteration), '|',
                       Hsp_num, '|', Hsp_query-from)"/>

  <!-- Identity template: copies every node that no other template handles. -->
  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <!--
    Suppresses an Hsp that is not the first member of its key group.
    The empty template body is what removes it from the output.
  -->
  <xsl:template match="Iteration_hits/Hit/Hit_hsps/Hsp[
                         generate-id() != generate-id(
                           key('TOP_query_from',
                               concat(generate-id(ancestor::Iteration), '|',
                                      Hsp_num, '|', Hsp_query-from))[1])]"/>

  <!--
    Suppresses a Hit when none of its Hsp children is the first member of
    its key group, so that no Hit with an empty Hit_hsps is emitted.
  -->
  <xsl:template match="Iteration_hits/Hit[
                         not(Hit_hsps/Hsp[
                           generate-id() = generate-id(
                             key('TOP_query_from',
                                 concat(generate-id(ancestor::Iteration), '|',
                                        Hsp_num, '|', Hsp_query-from))[1])])]"/>

</xsl:stylesheet>
答案 0 :(得分:0)
可以考虑把 <Iteration_query-ID>、<Hsp_num> 和 <Hsp_query-from> 连接起来作为键(key)。恒等变换(Identity Transform)虽然写法简洁,但鉴于这份 XML 的结构,按文档层级逐层重建输出的写法虽然更长,却可能是更可行的方案。恒等变换要求您显式删除不需要的节点,而这里还有一个额外的难点:在同级的 <Hsp> 节点中只保留其中一个而舍弃另一个:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Rebuilds a BLAST XML report level by level, keeping only the first Hsp in
  document order for every distinct combination of Iteration_query-ID,
  Hsp_num and Hsp_query-from. A Hit is emitted only when it owns at least
  one such Hsp; all other elements are copied verbatim.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <xsl:output method="xml" encoding="UTF-8" indent="yes"
              doctype-public="-//NCBI//NCBI BlastOutput/EN"
              doctype-system="http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"/>
  <xsl:strip-space elements="*"/>

  <!--
    Grouping key for Hsp elements. The '|' delimiters keep different value
    tuples from concatenating to the same string (for example Hsp_num 1 with
    query-from 1735 versus Hsp_num 11 with query-from 735).
  -->
  <xsl:key name="hspkey" match="Hsp"
           use="concat(ancestor::Iteration/Iteration_query-ID, '|',
                       Hsp_num, '|', Hsp_query-from)"/>

  <!--
    Fallback: any element without a more specific template below is copied
    as is, together with its whole subtree. Children are dispatched by name
    rather than by position, so optional elements that follow
    BlastOutput_iterations, Iteration_hits or Iteration_stat are kept and
    stay in their original order.
  -->
  <xsl:template match="*">
    <xsl:copy-of select="."/>
  </xsl:template>

  <!-- Containers on the path down to Hsp: copy the shell, then recurse. -->
  <xsl:template match="BlastOutput">
    <xsl:copy>
      <xsl:apply-templates select="*"/>
    </xsl:copy>
  </xsl:template>

  <xsl:template match="BlastOutput_iterations">
    <xsl:copy>
      <xsl:apply-templates select="Iteration"/>
    </xsl:copy>
  </xsl:template>

  <xsl:template match="Iteration">
    <xsl:copy>
      <xsl:apply-templates select="*"/>
    </xsl:copy>
  </xsl:template>

  <!-- Emit only the hits that still own at least one first-of-group Hsp. -->
  <xsl:template match="Iteration_hits">
    <xsl:copy>
      <xsl:apply-templates select="Hit[Hit_hsps/Hsp[generate-id(.) =
          generate-id(key('hspkey',
              concat(ancestor::Iteration/Iteration_query-ID, '|',
                     Hsp_num, '|', Hsp_query-from))[1])]]"/>
    </xsl:copy>
  </xsl:template>

  <xsl:template match="Hit">
    <xsl:copy>
      <xsl:apply-templates select="*"/>
    </xsl:copy>
  </xsl:template>

  <!-- Keep only the Hsp elements that are the first member of their group. -->
  <xsl:template match="Hit_hsps">
    <xsl:copy>
      <xsl:copy-of select="Hsp[generate-id(.) =
          generate-id(key('hspkey',
              concat(ancestor::Iteration/Iteration_query-ID, '|',
                     Hsp_num, '|', Hsp_query-from))[1])]"/>
    </xsl:copy>
  </xsl:template>

</xsl:stylesheet>