解析html标签内的内容

时间:2016-05-06 19:16:49

标签: c++ algorithm html-parsing

我正在解析html以获取每个标记内的内容。我有以下代码用于解析/重建Dom树:

#include "DomTreeBuilder.h"

#include <string>
#include <fstream>
#include <limits>
#include <algorithm>
#include "Node.h"
#include "DomTree.h"

using namespace std;

// Populates the passed domTree with the html contents from in.
//
// Repeatedly locates the next '<' in `in`, drops everything up to and
// including it, and hands the remainder to parse(), which records the tag in
// domTree and consumes the tag from the front of `in`.
//
// NOTE: `in` is modified (consumed) while it is being parsed.
// Input that is empty or contains no '<' at all is left untouched and nothing
// is added to domTree.
void DomTreeBuilder::populateDomTreeFromString(DomTree &domTree, string &in) {
    if (in.empty()) {
        // TODO: add error handling here
        return;
    }

    try {
        // Use string::size_type so the comparison against npos is exact
        // instead of relying on npos being truncated to -1 in an int.
        string::size_type tagIndex = in.find('<');

        // A pre-checked loop: text that contains no tag must never reach
        // parse(), which expects to be positioned just after a '<'.
        while (tagIndex != string::npos) {
            in.erase(0, tagIndex + 1);
            parse(in, domTree);
            tagIndex = in.find('<');
        }
    }
    catch (int) {
        // TODO: Improve handling
    }
}

/**
 *  Parses the single tag at the front of `in` and records it in domTree.
 *
 *  `in` is expected to start immediately after a tag's opening '<'.
 *   - A closing tag ("/name") is reported through domTree.closeNode(name).
 *   - Any other tag becomes a Node whose tag is the text up to the first
 *     whitespace or '>', and whose content is the decoded text between this
 *     tag's '>' and the next '<' (content is left unset when there is no
 *     further '<'). The node is passed to domTree.addNode().
 *
 *  On return `in` has been trimmed so that it starts at this tag's '>'.
 *  If the tag is never terminated by '>' the remaining input is treated as
 *  malformed: it is discarded (`in` is cleared) and domTree is not modified.
 */
void DomTreeBuilder::parse(string &in, DomTree &domTree) {
    const string::size_type endTagIndex = in.find('>');
    if (endTagIndex == string::npos) {
        // Unterminated tag or empty input. Previously this path ended in
        // substr()/at() throwing std::out_of_range; consume the tail instead
        // so the caller's scan loop terminates cleanly.
        in.clear();
        return;
    }

    // The tag name ends where the attributes start (any whitespace, not only
    // a space, may separate them) or, without attributes, at the '>'.
    const string::size_type attributeStart = in.find_first_of(" \t\r\n");
    string tag;
    if (attributeStart != string::npos && attributeStart < endTagIndex) {
        tag = in.substr(0, attributeStart);
    } else {
        tag = in.substr(0, endTagIndex);
    }

    // Check if opening or closing tag. `in` is non-empty here because a '>'
    // was found above.
    if (in[0] == '/') {
        tag.erase(0, 1); // drop the leading '/'
        domTree.closeNode(tag);
    } else {
        Node node;
        node.setTag(tag);

        // Content is only the text up to the next '<'; text that follows
        // child elements is not captured here.
        const string::size_type nextTag = in.find('<', endTagIndex);
        if (nextTag != string::npos) {
            string content = in.substr(endTagIndex + 1, nextTag - endTagIndex - 1);
            decodeHTML(content);
            node.setContent(content);
        }
        domTree.addNode(node);
    }

    // Keep the '>' at the front; the caller skips ahead to the next '<'.
    in.erase(0, endTagIndex);
}


/**
  * Decodes character entities in `line` in place and strips carriage returns.
  *
  *  - "&amp;", "&lt;" and "&gt;" are replaced by '&', '<' and '>'.
  *  - Any other well-formed entity ("&name;" or "&#123;") is removed.
  *  - A '&' that does not start an entity is left untouched.
  *  - "\r\n" pairs are removed; a lone '\r' is removed on its own.
  *
  * Decoded output is never re-scanned, so "&amp;lt;" becomes "&lt;".
  */
void DomTreeBuilder::decodeHTML(string &line) {
    // Characters that may appear between '&' and ';' in an entity reference.
    static const char entityNameChars[] =
        "#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    string::size_type ampIndex = line.find('&');
    while (ampIndex != string::npos) {
        // Where the search for the next '&' resumes; by default just past
        // the current one so a decoded '&' is not interpreted again.
        string::size_type resumeAt = ampIndex + 1;

        if (line.compare(ampIndex, 5, "&amp;") == 0) {
            line.erase(ampIndex + 1, 4); // keep the '&', drop "amp;"
        } else if (line.compare(ampIndex, 4, "&lt;") == 0) {
            line.replace(ampIndex, 4, 1, '<');
        } else if (line.compare(ampIndex, 4, "&gt;") == 0) {
            line.replace(ampIndex, 4, 1, '>');
        } else {
            // Remove other entities, but only the entity itself: it must be
            // a non-empty run of name characters terminated by ';'.
            const string::size_type nameEnd =
                line.find_first_not_of(entityNameChars, ampIndex + 1);
            if (nameEnd != string::npos && nameEnd > ampIndex + 1 && line[nameEnd] == ';') {
                line.erase(ampIndex, nameEnd - ampIndex + 1);
                resumeAt = ampIndex; // following text shifted into this slot
            }
        }

        ampIndex = line.find('&', resumeAt);
    }

    // Remove carriage returns together with the line feed that follows them.
    // A '\r' without a following '\n' must not swallow the next character.
    string::size_type crIndex = line.find('\r');
    while (crIndex != string::npos) {
        const bool hasLineFeed = crIndex + 1 < line.size() && line[crIndex + 1] == '\n';
        line.erase(crIndex, hasLineFeed ? 2 : 1);
        crIndex = line.find('\r', crIndex);
    }
}

这段代码一直工作正常，直到我遇到下面这样带有嵌套标签的 HTML 片段：

    <div>
        <p>Testing</p>
        <div>
            <p>1</p>
        </div>
        <p>second</p>
        Third
    </div>

如果我对最外层的 div 调用 node.getContent()，得到的内容是空的。我猜测需要在下面这段代码中检查标签内是否还有更多内容：

    // Get content for tag: only the text between this tag's '>' (endTagIndex)
    // and the next '<' is captured; if there is no further '<' the node's
    // content is left unset.
    int nextTag = in.find_first_of('<', endTagIndex);
    if (nextTag != -1) {
        string content = in.substr(endTagIndex + 1, nextTag - endTagIndex - 1);
        decodeHTML(content);
        node.setContent(content);
    }

如果您需要任何未在此处发布的代码,您可以查看github repo:https://github.com/Tyler-Hilbert/Dom-Builder/tree/SmashBoards

0 个答案:

没有答案