libxml2 sax解析器 - 提取文本节点

时间:2013-10-06 17:19:41

标签: c++ xml sax libxml2 saxparser

我想从 XML 输入中提取文本节点的值。下面的代码是我从网上找到的,因为 libxml 官方文档中有许多失效的链接,SAX 解析器的文档就是其中之一。请帮我获取文本节点的值。在 startElementNs 中,当我试图查找文本节点时,得到的是 NULL。感谢任何帮助。

我的xml看起来像这样:

<a>
   <b>
      <c> text values </c>
   </b>
</a>

我的代码如下所示:

#include <assert.h>
#include <memory.h>
#include <stdarg.h>
#include <stdio.h>
#include <string>
#include <libxml/xmlmemory.h>
#include <libxml/parser.h>


class ParseFSM
{
public:
   /** SAX2 callback when an element start has been detected by the parser. It provides the namespace informations for the element, as well as the new namespace declarations on the element.
      ctx:  the user data (XML parser context)
      localname:  the local name of the element
      prefix:  the element namespace prefix if available
      URI:  the element namespace name if available
      nb_namespaces: number of namespace definitions on that node
      namespaces: pointer to the array of prefix/URI pairs namespace definitions
      nb_attributes: the number of attributes on that node
      nb_defaulted:  the number of defaulted attributes. The defaulted ones are at the end of the array
      attributes: pointer to the array of (localname/prefix/URI/value/end) attribute values.
      **/
  static void startElementNs (void *ctx,
                  const xmlChar * localname,
                  const xmlChar * prefix,
                  const xmlChar * URI,
                  int nb_namespaces,
                  const xmlChar ** namespaces,
                  int nb_attributes,
                  int nb_defaulted, const xmlChar ** attributes)
  {
    ParseFSM & fsm = *(static_cast < ParseFSM * >(ctx));
    printf ("startElementNs: name = '%s' prefix = '%s' uri = (%p)'%s'\n", localname, prefix, URI, URI);
    for (int indexNamespace = 0; indexNamespace < nb_namespaces; ++indexNamespace)
      {
        const xmlChar *prefix = namespaces[indexNamespace * 2];
        const xmlChar *nsURI = namespaces[indexNamespace * 2 + 1];
        printf ("  namespace: name='%s' uri=(%p)'%s'\n", prefix, nsURI, nsURI);
      }
    unsigned int index = 0;
    for (int indexAttribute = 0; indexAttribute < nb_attributes; ++indexAttribute, index += 5)
      {
        const xmlChar *localname = attributes[index];
        const xmlChar *prefix = attributes[index + 1];
        const xmlChar *nsURI = attributes[index + 2];
        const xmlChar *valueBegin = attributes[index + 3];
        const xmlChar *valueEnd = attributes[index + 4];
        std::string value ((const char *) valueBegin, (const char *) valueEnd);
        printf ("  %sattribute: localname='%s', prefix='%s', uri=(%p)'%s', value='%s'\n", indexAttribute >= (nb_attributes - nb_defaulted) ? "defaulted " : "", localname, prefix, nsURI, nsURI, value.c_str ());
      }
  }
   /** SAX2 callback when an element end has been detected by the parser. It provides the namespace informations for the element.
      ctx:  the user data (XML parser context)
      localname:  the local name of the element
      prefix:  the element namespace prefix if available
      URI:  the element namespace name if available
      **/
  static void endElementNs (void *ctx,
                const xmlChar * localname,
                const xmlChar * prefix, const xmlChar * URI)
  {
    ParseFSM & fsm = *(static_cast < ParseFSM * >(ctx));
    printf ("endElementNs: name = '%s' prefix = '%s' uri = '%s'\n", localname,
        prefix, URI);
  }
   /** Display and format an error messages, callback.
      ctx:  an XML parser context
      msg:  the message to display/transmit
      ...:  extra parameters for the message display
      */
  static void error (void *ctx, const char *msg, ...)
  {
    ParseFSM & fsm = *(static_cast < ParseFSM * >(ctx));
    va_list args;
    va_start (args, msg);
    vprintf (msg, args);
    va_end (args);
  }

   /** Display and format a warning messages, callback.
      ctx:  an XML parser context
      msg:  the message to display/transmit
      ...:  extra parameters for the message display
      */
  static void warning (void *ctx, const char *msg, ...)
  {
    ParseFSM & fsm = *(static_cast < ParseFSM * >(ctx));
    va_list args;
    va_start (args, msg);
    vprintf (msg, args);
    va_end (args);
  }
};
int
main (int argc, const char *argv[])
{
  std::string xmlIn = "<a><b><c> text values </c> </b> </a>"
  /*
   * this initialize the library and check potential ABI mismatches
   * between the version it was compiled for and the actual shared
   * library used.
   */
  LIBXML_TEST_VERSION xmlSAXHandler saxHandler; // See http://xmlsoft.org/html/libxml-tree.html#xmlSAXHandler
  memset (&saxHandler, 0, sizeof (saxHandler));
  // Using xmlSAXVersion( &saxHandler, 2 ) generate crash as it sets plenty of other pointers...
  saxHandler.initialized = XML_SAX2_MAGIC;  // so we do this to force parsing as SAX2.
  saxHandler.startElementNs = &ParseFSM::startElementNs;
  saxHandler.endElementNs = &ParseFSM::endElementNs;
  saxHandler.warning = &ParseFSM::warning;
  saxHandler.error = &ParseFSM::error;

  ParseFSM fsm;
  int result =
    xmlSAXUserParseMemory (&saxHandler, &fsm, xmlIn.c_str (),
               int (xmlIn.length ()));
  if (result != 0)
    {
      printf ("Failed to parse document.\n");
      return 1;
    }

  /*
   * Cleanup function for the XML library.
   */
  xmlCleanupParser ();
  /*
   * this is to debug memory for regression tests
   */
  xmlMemoryDump ();

  return 0;
}

1 个答案:

答案 0 :(得分:1)

  1. 您需要使用字符回调

    void characters(void * user_data,     const xmlChar * ch,     int len);

  2. 传入的字符串不是以空字符结尾的,你需要同时使用 ch 和 len 来确定字符串的范围

  3. 此回调的另一个问题是它可以在start和end元素之间多次调用。所以你不能盲目地假设你在回调中得到的是标签之间的字符串。您可能需要使用字符串构建器或某些东西来收集字符串。

  4.   

    在回调中,您可能希望将字符复制到   一些其他缓冲区,以便可以从endElement回调中使用它。   要稍微优化此回调,您可以调整回调以便   如果解析器处于某种状态,它只复制字符。   请注意,可以在两者之间多次调用字符回调   调用startElement和endElement。

    希望这能回答你的问题;即使这个回答来得有些晚,其他人或许也能从中得到帮助