使用libxml2解析XML文件时缺少元素值

时间:2018-09-28 20:06:59

标签: c++ xml libxml2

我正在使用 libxml2 解析 XML 文件中的特定标签(例如 title)。

解析此XML:

<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
  <entry>
    <title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs1</title>
  </entry>
  <entry>
    <title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs2</title>
  </entry>
  <entry>
    <title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs3</title>
  </entry>
  <entry>
    <title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs4</title>
  </entry>
  <entry>
    <title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs5</title>
  </entry>
  <entry>
    <title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs6</title>
  </entry>
  <entry>
    <title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs7</title>
  </entry>
  <entry>
    <title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs8</title>
  </entry>
  <entry>
    <title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs9</title>
  </entry>
  <entry>
    <title type="html">Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs10</title>
  </entry>
</feed>

使用以下 C++ 代码:

/**
 * Handles the node the reader is currently positioned on.
 *
 * streamFile() calls this once for every node it reads. Only nodes named
 * "entry" and "title" are acted upon; all other nodes are ignored.
 *
 * NOTE(review): `root` and `closeEntry` are function-local statics. They keep
 * their values between calls (including between separate streamFile() runs)
 * and nothing in this function resets them.
 *
 * @param reader  libxml2 text reader positioned on the node to process.
 */
void CXMLManager::processNode(xmlTextReaderPtr reader)
{
    // True until the first "entry" node has been handled; never set back to true.
    static bool root = true;
    std::string name;

    // NOTE(review): if xmlTextReaderConstName() can return NULL, constructing
    // a std::string from it is undefined behaviour -- confirm and guard.
    name  = std::string((const char *) xmlTextReaderConstName (reader));

    // The node type is not checked here, so presumably this branch runs for
    // both the opening and the closing <entry> tag -- TODO confirm.
    if (name == "entry")
    {
        if (root)
        {
            // First "entry" node ever seen: remember the current title as the
            // feed name and do not emit an item.
            m_name = m_title;
            root = false;
            return;
        }

        // Flipped on every subsequent "entry" node, so an item is pushed on
        // every other one.
        static bool closeEntry = true;

        if (closeEntry)
        {
            m_feedBuffer.push_back( CFeed { m_name, m_title, m_updated, m_author, m_link } );

            // Start the next item with an empty title.
            m_title = "";
        }

        closeEntry = !closeEntry;
    }
    // Any "title" node that is not an end tag: capture its text content.
    else if (name == "title" && xmlTextReaderNodeType(reader) != XML_READER_TYPE_END_ELEMENT)
    {
        m_title = getElementContent(reader);
        std::cout << "Title: " << m_title << std::endl;
    }
}

/**
 * Returns the text content of the element the reader is positioned on.
 *
 * The text reader parses its input incrementally, so when it stops on an
 * element's start tag the element's children may not have been parsed yet.
 * xmlTextReaderExpand() reads the complete subtree of the current node
 * before its content is collected; without that step xmlNodeGetContent()
 * can see a node with no children and yield an empty string.
 *
 * @param reader  libxml2 text reader positioned on the element to read.
 * @return The element's text content, or an empty string when the subtree
 *         cannot be expanded or the node has no content.
 */
std::string CXMLManager::getElementContent(xmlTextReaderPtr reader)
{
    // Force the whole subtree to be available (valid until the next Read).
    xmlNodePtr node = xmlTextReaderExpand(reader);
    if (node == NULL)
        return std::string();

    // xmlNodeGetContent() returns a newly allocated buffer (or NULL).
    xmlChar *raw = xmlNodeGetContent(node);
    if (raw == NULL)
        return std::string();

    std::string content(reinterpret_cast<const char *>(raw));

    // The buffer is owned by the caller and must be released with xmlFree().
    xmlFree(raw);

    return content;
}

void CXMLManager::streamFile(const char *data, size_t size)
{
    xmlTextReaderPtr reader;
    int ret;

    /*
     * Pass some special parsing options to activate DTD attribute defaulting,
     * entities substitution and DTD validation
     */
    reader = xmlReaderForMemory(data, size, NULL, NULL,
                XML_PARSE_DTDATTR |  /* default DTD attributes */
                XML_PARSE_NOENT);    /* substitute entities */

    if (reader != NULL)
    {
        ret = xmlTextReaderRead(reader);

        while (ret == 1)
        {
            processNode(reader);
            ret = xmlTextReaderRead(reader);
        }
    }
    else
    {
        throw CFeedreaderException("FEEDREADER: Failed to parse XML.", E_WRONG_XML);
    }
}

在大多数情况下,我能得到正确的结果,但偶尔会得到一个空字符串(尽管 XML 中对应的内容是正确的):

Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs1
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs2
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs3
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs4

Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs6
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs7
Swedish ISP spanked for sexist 'distracted boyfriend' advert for developer jobs8

在解析之前,我已经多次检查过这份 XML 及其正确性,所以不知道问题可能出在哪里。对于这份输入,第 5 个字符串每次都会丢失。

1 个答案:

答案 0 :(得分:2)

static局部变量很可能会影响您的处理。请记住,static局部变量在函数调用之间保留其值。一旦streamFile()退出并再次被调用,您的static变量仍将具有其先前的值,它们不会被重置回其原始值。您必须将它们更改为CXMLManager类的成员,以便streamFile()可以在每次调用时重置它们。

我不建议用单个函数去处理您需要解析的每一种节点。我会把读取逻辑拆分成多个独立的函数,每个函数只负责 XML 文档中的一个层级,如下所示:

/**
 * Reads the children of a <feed> element and stores every <entry> found.
 *
 * Must be called with the reader positioned on the <feed> start tag. Returns
 * once the matching end tag (same depth) has been consumed, or when the
 * document ends.
 *
 * @param reader  libxml2 text reader positioned on the <feed> element.
 * @throws CFeedreaderException if xmlTextReaderRead() reports an error.
 */
void CXMLManager::readFeed(xmlTextReaderPtr reader)
{
    // read attributes if needed...

    // <feed/> has no children and no separate end tag to wait for.
    if (xmlTextReaderIsEmptyElement(reader))
        return;

    // Depth of <feed> itself; its end tag is reported at the same depth.
    int depth = xmlTextReaderNodeDepth(reader);
    int ret;

    while ((ret = xmlTextReaderRead(reader)) == 1)
    {
        switch (xmlTextReaderNodeType(reader))
        {
            case XML_READER_TYPE_ELEMENT:
            {
                if (xmlStrEqual(xmlTextReaderConstLocalName(reader), BAD_CAST "entry"))
                {
                    // readFeedEntry() consumes everything up to </entry>.
                    CFeed entry;
                    readFeedEntry(reader, entry);
                    m_feedBuffer.push_back(entry);
                }
                break;
            }

            case XML_READER_TYPE_END_ELEMENT:
            {
                // An end tag at our own depth can only be </feed>.
                if ((xmlTextReaderNodeDepth(reader) == depth)
                    /*&& xmlStrEqual(xmlTextReaderConstLocalName(reader), BAD_CAST "feed")*/)
                {
                    return;
                }
                break;
            }
        }
    }

    // ret == 0 means end of document; only -1 signals a parser error.
    // E_WRONG_XML is the only error code visible here; use a more specific one if available.
    if (ret == -1)
        throw CFeedreaderException("FEEDREADER: Failed to read XML.", E_WRONG_XML);
}

/**
 * Reads the children of an <entry> element into @p entry.
 *
 * Must be called with the reader positioned on the <entry> start tag. Returns
 * once the matching end tag (same depth) has been consumed, or when the
 * document ends. Currently only <title> is extracted.
 *
 * @param reader  libxml2 text reader positioned on the <entry> element.
 * @param entry   Feed item that receives the parsed values.
 * @throws CFeedreaderException if xmlTextReaderRead() reports an error.
 */
void CXMLManager::readFeedEntry(xmlTextReaderPtr reader, CFeed &entry)
{
    // read attributes if needed...

    // <entry/> has no children and no separate end tag to wait for.
    if (xmlTextReaderIsEmptyElement(reader))
        return;

    // Depth of <entry> itself; its end tag is reported at the same depth.
    int depth = xmlTextReaderNodeDepth(reader);
    int ret;

    while ((ret = xmlTextReaderRead(reader)) == 1)
    {
        switch (xmlTextReaderNodeType(reader))
        {
            case XML_READER_TYPE_ELEMENT:
            {
                const xmlChar *name = xmlTextReaderConstLocalName(reader);

                if (xmlStrEqual(name, BAD_CAST "title"))
                {
                    // readText() consumes everything up to </title>.
                    readText(reader, entry.m_title/*, BAD_CAST "title"*/);
                    std::cout << "Title: " << entry.m_title << std::endl;
                }
                // else other <entry> children as needed ...

                break;
            }

            case XML_READER_TYPE_END_ELEMENT:
            {
                // An end tag at our own depth can only be </entry>.
                if ((xmlTextReaderNodeDepth(reader) == depth)
                    /*&& xmlStrEqual(xmlTextReaderConstLocalName(reader), BAD_CAST "entry")*/)
                {
                    return;
                }
                break;
            }
        }
    }

    // ret == 0 means end of document; only -1 signals a parser error.
    // E_WRONG_XML is the only error code visible here; use a more specific one if available.
    if (ret == -1)
        throw CFeedreaderException("FEEDREADER: Failed to read XML.", E_WRONG_XML);
}

/**
 * Collects the character data of the element the reader is positioned on.
 *
 * Must be called with the reader positioned on the element's start tag.
 * Text and CDATA nodes found before the matching end tag (same depth) are
 * appended to @p text; nested elements are skipped but their text is still
 * collected.
 *
 * @param reader  libxml2 text reader positioned on the element to read.
 * @param text    Receives the element's text; cleared first.
 * @throws CFeedreaderException if xmlTextReaderRead() reports an error.
 */
void CXMLManager::readText(xmlTextReaderPtr reader, std::string &text/*, const xmlChar *tagName */)
{
    text.clear();

    // An empty element (<title/>) has no content and no separate end tag.
    if (xmlTextReaderIsEmptyElement(reader))
        return;

    // Depth of the element itself; its end tag is reported at the same depth.
    int depth = xmlTextReaderNodeDepth(reader);
    int ret;

    while ((ret = xmlTextReaderRead(reader)) == 1)
    {
        switch (xmlTextReaderNodeType(reader))
        {
            // TODO: handle XML_READER_TYPE_ELEMENT if you need to treat
            // embedded XML elements as part of the text, such as for
            // formatting instructions (like <b>, <i>, etc)...

            // CDATA sections carry character data just like plain text nodes.
            case XML_READER_TYPE_TEXT:
            case XML_READER_TYPE_CDATA:
            {
                // The value may be NULL; appending a null pointer is undefined.
                const xmlChar *value = xmlTextReaderConstValue(reader);
                if (value != NULL)
                    text += reinterpret_cast<const char*>(value);
                break;
            }

            case XML_READER_TYPE_END_ELEMENT:
            {
                // An end tag at our own depth closes the element being read.
                if ((xmlTextReaderNodeDepth(reader) == depth)
                    /*&& xmlStrEqual(xmlTextReaderConstLocalName(reader), tagName)*/)
                {
                    return;
                }
                break;
            }
        }
    }

    // ret == 0 means end of document; only -1 signals a parser error.
    // E_WRONG_XML is the only error code visible here; use a more specific one if available.
    if (ret == -1)
        throw CFeedreaderException("FEEDREADER: Failed to read XML.", E_WRONG_XML);
}

/**
 * Parses an in-memory XML document and hands every <feed> element to readFeed().
 *
 * @param data  Buffer holding the XML document.
 * @param size  Number of bytes in @p data.
 * @throws CFeedreaderException if the reader cannot be created or if
 *         xmlTextReaderRead() reports an error.
 */
void CXMLManager::streamFile(const char *data, size_t size)
{
    /*
     * Parser options: apply DTD attribute defaults and substitute entities.
     * DTD validation is NOT requested here (that would need XML_PARSE_DTDVALID).
     */
    xmlTextReaderPtr reader = xmlReaderForMemory(data, size, NULL, NULL,
                XML_PARSE_DTDATTR |  /* default DTD attributes */
                XML_PARSE_NOENT);    /* substitute entities */

    if (!reader)
        throw CFeedreaderException("FEEDREADER: Failed to parse XML.", E_WRONG_XML);

    // Releases the reader on every exit path. The deleter type has to be a
    // function *pointer*; a plain function type is not a valid deleter.
    std::unique_ptr<xmlTextReader, decltype(&xmlFreeTextReader)> reader_deleter(reader, &xmlFreeTextReader);
    int ret;

    while ((ret = xmlTextReaderRead(reader)) == 1)
    {
        if ((xmlTextReaderNodeType(reader) == XML_READER_TYPE_ELEMENT)
            && xmlStrEqual(xmlTextReaderConstLocalName(reader), BAD_CAST "feed"))
        {
            // readFeed() consumes everything up to </feed>.
            readFeed(reader);
        }
    }

    // ret == 0 means end of document; only -1 signals a parser error.
    // E_WRONG_XML is the only error code visible here; use a more specific one if available.
    if (ret == -1)
        throw CFeedreaderException("FEEDREADER: Failed to read XML.", E_WRONG_XML);
}

或者,我建议完全去掉这些辅助函数,直接在 streamFile() 内部完成所有工作,用一个局部状态机来遍历 reader,例如:

void CXMLManager::streamFile(const char *data, size_t size)
{
    /*
     * Pass some special parsing options to activate DTD attribute defaulting,
     * entities substitution and DTD validation
     */
    xmlTextReaderPtr reader = xmlReaderForMemory(data, size, NULL, NULL,
                XML_PARSE_DTDATTR |  /* default DTD attributes */
                XML_PARSE_NOENT);    /* substitute entities */

    if (!reader)
        throw CFeedreaderException("FEEDREADER: Failed to parse XML.", E_WRONG_XML);

    std::unique_ptr<xmlTextReader, decltype(xmlFreeTextReader)> reader_deleter(reader, xmlFreeTextReader);

    std::string name, title, updated, author, link, text;
    int feedDepth = -1;
    int entryDepth = -1;
    int textDepth = -1;
    int ret;

    while ((ret = xmlTextReaderRead(reader)) == 1)
    {
        switch (xmlTextReaderNodeType(reader))
        {
            case XML_READER_TYPE_ELEMENT:
            {
                if (textDepth != -1)
                {
                    // TODO: handle this case if you need to treat embedded
                    // XML elements as part of the text, such as for formatting
                    // instructions (like <b>, <i>, etc)...
                    break;
                }

                const xmlChar *name = xmlTextReaderConstLocalName(reader);

                if (feedDepth == -1)
                {
                    if (xmlStrEqual(name, BAD_CAST "feed"))
                    {
                        // read attributes if needed...

                        feedDepth == xmlTextReaderNodeDepth(reader);
                    }
                }
                else if (entryDepth == -1)
                {
                    if (xmlStrEqual(name, BAD_CAST "entry"))
                    {
                        name = title = updated = author = link = text = "";

                        // read attributes if needed...

                        if (xmlTextReaderIsEmptyElement(reader))
                            m_feedBuffer.push_back( CFeed { name, title, updated, author, link } );
                        else
                            entryDepth == xmlTextReaderNodeDepth(reader);
                    }
                }
                else if (xmlStrEqual(name, BAD_CAST "title"))
                {
                    text.clear();
                    if (!xmlTextReaderIsEmptyElement(reader))
                        textDepth = xmlTextReaderNodeDepth(reader);
                    else
                        textDepth = -1;
                }
                // else other <entry> children as needed ...

                break;
            }

            case XML_READER_TYPE_TEXT:
            {
                if (textDepth != -1)
                {
                    const xmlChar *value = xmlTextReeaderConstValue(reader);
                    text += reinterpret_cast<const char*>(value);
                }

                break;
            }

            case XML_READER_TYPE_END_ELEMENT:
            {
                const xmlChar *name = xmlTextReaderConstLocalName(reader);

                if (textDepth != -1)
                {
                    if ((xmlTextReaderNodeDepth(reader) == textDepth)
                        /*&& xmlStrEqual(name, BAD_CAST "title")*/)
                    {
                        textDepth = -1;

                        title = text;
                        text.clear();

                        std::cout << "Title: " << title << std::endl;
                    }
                    // else other <entry> children as needed ...
                }
                else if (entryDepth != -1)
                {
                    if ((xmlTextReaderNodeDepth(reader) == entryDepth)
                        /*&& xmlStrEqual(name, BAD_CAST "entry")*/)
                    {
                        entryDepth = -1;
                        m_feedBuffer.push_back( CFeed { name, title, updated, author, link } );
                    }
                }
                else if (feedDepth != -1)
                {
                    if ((xmlTextReaderNodeDepth(reader) == feedDepth)
                        /*&& xmlStrEqual(name, BAD_CAST "feed")*/)
                    {
                        feedDepth = -1;
                    }
                }

                break;
            }
        }
    }

    if (ret == -1)
        throw CFeedreaderException("FEEDREADER: Failed to read XML.", ...);
}