使用perl解析xml文档的元素(包括属性)和文本节点

时间:2011-03-22 05:14:12

标签: xml perl

我有以下test.xml:

<root>
<A title="A1">
  <B title="B1">
   <C title="C1">
    <params>param=ABC1</params>
    <params>param=ABC2</params>
   </C>
  </B>
</A>
<D title="D1">
  <B title="B2">
   <C title="C2">        
     <params>param=DBC1</params>
     <params>param=DBC2</params>
   </C>
  </B>
</D>
</root>

我需要一个perl代码来解析它并将报告打印为:

NodeName,  Attribute(s),  Param(s)
A          A1
B          B1
C          C1          param=ABC1 param=ABC2
D          D1
B          B2
C          C2          param=DBC1 param=DBC2  

我尝试过使用 getElementByTagName('param')、getNodeChilds 等方法,但都没有成功。另外,我一直使用的是 XML::DOM 模块。

以下是代码:

 use XML::DOM;
 # Build an XML::DOM parser and load test.xml into a document tree.
 # NOTE(review): `new XML::DOM::Parser` is indirect-object syntax;
 # `XML::DOM::Parser->new` is the unambiguous spelling.
 my $parser = new XML::DOM::Parser;
 my $doc = $parser->parsefile("test.xml");
 # File-scoped accumulator: ParseXML pushes <params> text onto it and
 # empties it after each row it prints.
 my @paramarray=();
 # Start the recursive walk at the document node with an empty indent string.
 ParseXML($doc,"");

# ParseXML($node, $indent)
#
# Recursively walks an XML::DOM tree starting at $node. For every element
# other than "root" it prints "<name>, <title attribute>, <@paramarray>" and
# then clears @paramarray. $indent is only passed along to recursive calls;
# it is never used in the output.
#
# Reads the file-scoped $doc and reads/writes the file-scoped @paramarray.
#
# NOTE(review): this is the asker's non-working version. It does not compile
# as written (see the missing semicolon below) and relies on `null`, which is
# not a Perl keyword.
sub ParseXML{
 my $node = $_[0];
 my $indent = $_[1];
 my $title;

 # NOTE(review): `null` is a bareword, not Perl's undef; a plain truth test
 # on $node is presumably what was intended.
 if ($node == null) {
  return;
 }

 my $type = $node->getNodeType();
 # The document node itself prints nothing: descend into its first child.
 if ($type == DOCUMENT_NODE) {
  ParseXML($node->getFirstChild(),"");
  # NOTE(review): `break` does not leave a Perl sub; `return` looks like
  # the intent.
  break;
 }  

  if ($type == ELEMENT_NODE) {
  # NOTE(review): $numberAttributes, $loopIndex and $attribute are used
  # without `my`, so they are package globals shared across recursive calls.
  $numberAttributes =0;
  if ($node->getAttributes() !=null){
     $numberAttributes = $node->getAttributes()->getLength();
  }

 # Scan the attributes and remember the value of the one named "title".
 for ($loopIndex =0; $loopIndex<$numberAttributes; $loopIndex++) {
     $attribute = ($node->getAttributes())->item($loopIndex);
     if($attribute->getNodeName() eq "title"){
      $title = $attribute->getNodeValue();
     }
 }

 # On a <params> element, collect the child data of EVERY <params> element
 # in the whole document (the query runs on $doc, not on $node), so the
 # accumulator is not limited to the current parent's params.
 if ($node->getNodeName() eq "params"){
  foreach my $paramvar ($doc->getElementsByTagName("params")) {
     foreach my $child ($paramvar->getChildNodes) {
        push(@paramarray, $child->getData);
     }
  }
 }


 # Print one row for every element except the root, then reset the
 # accumulator. This branch also runs for <params> elements themselves.
 if ($node->getNodeName() ne "root") {
      print $node->getNodeName. ", $title, @paramarray\n";
      @paramarray=();
 } 

 # NOTE(review): the next statement has no terminating semicolon, which is
 # a syntax error.
 my @childNodes = $node->getChildNodes()
 # NOTE(review): `!= null` again; testing the array's truth (non-empty)
 # is presumably what was meant.
 if (@childNodes != null){
   my $numberChildNodes = $#childNodes + 1;
   my $loopIndex;
  # Recurse into each child in order, passing the same indent along.
  for ($loopIndex =0; $loopIndex<$numberChildNodes; $loopIndex++) {
       ParseXML($childNodes[$loopIndex],$indent);
  }
 }
 }

  # Text nodes: the value is fetched but never used.
  if ($type == TEXT_NODE) {
    my  $nodeText = $node->getNodeValue();
  }

 }

3 个答案:

答案 0 :(得分:2)

以下是如何使用XML::Twig实现任务的示例代码:

use strict; use warnings;
use XML::Twig;

# Build the whole twig first and report afterwards.
#
# Handlers registered through twig_handlers run when an element's closing
# tag is parsed, so printing from a handler emits children before their
# parents (C, B, A, C, B, D). The requested report lists the elements in
# document order (A, B, C, D, B, C), which a walk over the finished tree
# provides.
my $twig = XML::Twig->new;
$twig->parsefile('test.xml');

# '#ELT' limits the walk to real elements (no text, comment or PI nodes);
# descendants_or_self keeps the root eligible, like '//*[@title]' did.
for my $elt ( $twig->root->descendants_or_self('#ELT') ) {
    my $title = $elt->att('title');
    next if !defined $title;    # only elements that carry a title attribute

    # One tab-separated row: tag name, title, then the trimmed text of each
    # direct <params> child.
    print join( "\t",
        $elt->gi,
        $title,
        map { $_->trimmed_text } $elt->findnodes('params')
    ), "\n";
}

答案 1 :(得分:1)

首先,始终以下面两行作为程序的开头:

use strict;
use warnings;

这能帮你发现很多拼写错误和低级错误。你遇到的一个大问题是:null 并不是 Perl 的关键字。Perl 使用 undef 和 defined 函数(不过在这种情况下你可能并不需要 defined,因为 undef 为假,而对象通常为真)。

下面是稍作清理后的代码版本。它仍然不能产生你要求的输出,但已经更接近了。

use strict;
use warnings;
use XML::DOM;

# File-scoped state used directly by ParseXML: the accumulator for <params>
# text and the parsed document.
my @paramarray;
my $parser = XML::DOM::Parser->new();
my $doc    = $parser->parsefile("test.xml");

# Walk the tree from the document node; the indent string starts empty.
ParseXML( $doc, "" );

# ParseXML($node, $indent)
#
# Depth-first walk over an XML::DOM tree.
#   - Document node: recurse into its first child only.
#   - <params> element: append the child data of every <params> element in
#     the file-scoped $doc to the file-scoped @paramarray.
#   - Any other element except "root": print "<name>, <title>, <@paramarray>"
#     and empty @paramarray.
#   - Text node: its value is fetched and discarded.
# $indent is only forwarded to recursive calls.
sub ParseXML {
  my ($node, $indent) = @_;
  return unless $node;

  my $kind = $node->getNodeType();

  if ($kind == DOCUMENT_NODE) {
    ParseXML($node->getFirstChild(), "");
    return;
  }

  if ($kind == ELEMENT_NODE) {
    # Find the "title" attribute, if the element has one.
    my $title;
    my $attrs      = $node->getAttributes();
    my $attr_count = $attrs ? $attrs->getLength() : 0;
    for my $i (0 .. $attr_count - 1) {
      my $attr = $attrs->item($i);
      $title = $attr->getNodeValue() if $attr->getNodeName() eq "title";
    }

    my $name = $node->getNodeName();
    if ($name eq "params") {
      # The query runs on the whole document, not on this element.
      for my $params_elt ($doc->getElementsByTagName("params")) {
        push @paramarray, map { $_->getData } $params_elt->getChildNodes;
      }
    }
    elsif ($name ne "root") {
      print $name . ", $title, @paramarray\n";
      @paramarray = ();
    }

    # Visit the children in order.
    for my $child ($node->getChildNodes()) {
      ParseXML($child, $indent);
    }
  }

  if ($kind == TEXT_NODE) {
    # The text is read but not used.
    my $text = $node->getNodeValue();
  }
}

答案 2 :(得分:1)

我使用XML::LibXML,所以这是一个使用该模块的解决方案。

use strict;
use warnings;

use XML::LibXML qw( );

# Load test.xml and print one row per element that has a "title" attribute,
# in document order: element name, title, then the text of its direct
# <params> children joined by single spaces.
my $dom = XML::LibXML->new->parse_file("test.xml");

foreach my $titled ( $dom->documentElement->findnodes('//*[@title]') ) {
    my $param_text = join ' ',
        map { $_->textContent } $titled->findnodes('params');

    printf "%-10s %-11s %s\n",
        $titled->nodeName,
        $titled->getAttribute('title'),
        $param_text;
}

更新:仍然使用 XML::LibXML,但这次不使用 XPath,以便于转换为 XML::DOM。

use strict;
use warnings;

use XML::LibXML qw( XML_ELEMENT_NODE );

# find_params($node)
#
# Returns the text content of each direct child element of $node whose name
# is "params", in child order. Non-element children and elements with other
# names are skipped.
sub find_params {
    my ($node) = @_;

    my @texts =
        map  { $_->textContent() }
        grep { $_->nodeType == XML_ELEMENT_NODE && $_->nodeName eq 'params' }
        $node->childNodes();

    return @texts;
}

# visit($node)
#
# Pre-order walk over element nodes. An element with a "title" attribute
# gets one printed row: its name, the title value, and the space-joined
# result of find_params for that element. Non-element nodes are ignored.
sub visit {
    my ($node) = @_;
    return unless $node->nodeType == XML_ELEMENT_NODE;

    my $title_attr = $node->getAttributeNode('title');
    if ($title_attr) {
        my $param_text = join ' ', find_params($node);
        printf "%-10s %-11s %s\n",
            $node->nodeName(), $title_attr->getValue(), $param_text;
    }

    # Descend after printing so parents appear before their children.
    foreach my $child ( $node->childNodes() ) {
        visit($child);
    }
}

# Entry point: parse test.xml and walk the tree from its root element.
# $doc stays in scope for the duration of the walk.
my $doc = XML::LibXML->new->parse_file("test.xml");

visit( $doc->documentElement );