我正在解析HTML,以获取每个标签内部的内容。我使用以下代码来解析并重建DOM树:
#include "DomTreeBuilder.h"
#include <string>
#include <fstream>
#include <limits>
#include <algorithm>
#include "Node.h"
#include "DomTree.h"
using namespace std;
/**
 * Populates the passed domTree with the html contents from in.
 *
 * The input is scanned for '<' characters; the text following each one is
 * handed to parse(), which adds or closes a node on domTree.
 *
 * @param domTree tree that receives the parsed nodes
 * @param in      html text; it is consumed (modified) while parsing
 */
void DomTreeBuilder::populateDomTreeFromString(DomTree &domTree, string &in) {
    if (in.empty()) {
        // TODO: add error handling here
        return;
    }

    try {
        // Only call parse() when a '<' was actually found; input that has no
        // tag at all must not be treated as if it started with one.
        string::size_type tagIndex = in.find('<');
        while (tagIndex != string::npos) {
            // Drop everything up to and including the '<'.
            in.erase(0, tagIndex + 1);
            parse(in, domTree);
            tagIndex = in.find('<');
        }
    } catch (int) {
        // TODO: improve handling
    }
}
/**
 * Parses a single tag found at the start of in and applies it to domTree.
 *
 * in must begin right after a '<'. A closing tag ("/name>") closes the node
 * with that name; any other tag creates a Node with the tag name and, when
 * another '<' follows, the decoded text between this tag's '>' and that '<'
 * as its content.
 *
 * On return, in starts at the '>' that ended the tag. If the tag is never
 * terminated by '>', the rest of in is discarded.
 *
 * NOTE: only the text that directly follows an opening tag is recorded.
 * Text that appears after a child element's closing tag is not attached to
 * any node here.
 *
 * @param in      remaining html text, positioned just past a '<'
 * @param domTree tree that receives or closes the node
 */
void DomTreeBuilder::parse(string &in, DomTree &domTree) {
    const string::size_type tagEnd = in.find('>');
    if (tagEnd == string::npos) {
        // Unterminated (or empty) tag: nothing sensible can be built from it,
        // so consume the remainder instead of indexing out of range.
        in.clear();
        return;
    }

    // The tag name ends at the first whitespace inside the tag (attributes
    // follow it) or, if there is none, at the closing '>'.
    string::size_type nameEnd = in.find_first_of(" \t\r\n");
    if (nameEnd == string::npos || nameEnd > tagEnd) {
        nameEnd = tagEnd;
    }
    string tag = in.substr(0, nameEnd);

    if (in[0] == '/') {
        // Closing tag: strip the leading '/' to get the name.
        tag = tag.substr(1);
        domTree.closeNode(tag);
    } else {
        Node node;
        node.setTag(tag);

        // Content is the text between this tag's '>' and the next '<'.
        const string::size_type nextTag = in.find('<', tagEnd);
        if (nextTag != string::npos) {
            string content = in.substr(tagEnd + 1, nextTag - tagEnd - 1);
            decodeHTML(content);
            node.setContent(content);
        }
        domTree.addNode(node);
    }

    // Leave the '>' at the front; the caller searches for the next '<'.
    in.erase(0, tagEnd);
}
/**
 * Decodes character entities in line, in place.
 *
 * "&amp;", "&lt;" and "&gt;" are replaced by '&', '<' and '>'. Any other
 * entity reference ('&', then letters/digits/'#', then ';') is removed.
 * An '&' that does not start such a reference is left untouched.
 * Carriage returns are removed afterwards, together with the '\n' that
 * immediately follows one, if any.
 *
 * @param line text to decode; modified in place
 */
void DomTreeBuilder::decodeHTML(string &line) {
    // Characters allowed between '&' and ';' in an entity reference.
    static const char *const kEntityChars =
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789#";

    string::size_type pos = line.find('&');
    while (pos != string::npos) {
        if (line.compare(pos, 5, "&amp;") == 0) {
            // Keep the '&', drop "amp;". Step past it so the decoded '&'
            // is not interpreted again (e.g. "&amp;lt;" becomes "&lt;").
            line.erase(pos + 1, 4);
            ++pos;
        } else if (line.compare(pos, 4, "&lt;") == 0) {
            line.replace(pos, 4, 1, '<');
            ++pos;
        } else if (line.compare(pos, 4, "&gt;") == 0) {
            line.replace(pos, 4, 1, '>');
            ++pos;
        } else {
            const string::size_type nameEnd =
                line.find_first_not_of(kEntityChars, pos + 1);
            const bool isEntity = nameEnd != string::npos
                && nameEnd > pos + 1
                && line[nameEnd] == ';';
            if (isEntity) {
                // Remove only the entity itself; pos then refers to the
                // text that followed it.
                line.erase(pos, nameEnd - pos + 1);
            } else {
                // Bare ampersand: keep it and move on.
                ++pos;
            }
        }
        pos = line.find('&', pos);
    }

    // Remove carriage returns (and the line feed of a "\r\n" pair).
    // TODO: This should be replaced with something better
    string::size_type cr = line.find('\r');
    while (cr != string::npos) {
        const bool crlf = cr + 1 < line.size() && line[cr + 1] == '\n';
        line.erase(cr, crlf ? 2 : 1);
        cr = line.find('\r', cr);
    }
}
这段代码工作正常,直到我得到一个看起来像这样的html标签:
<div>
<p>Testing</p>
<div>
<p>1</p>
</div>
<p>second</p>
Third
</div>
如果我对最外层的div调用node.getContent(),返回的内容是空的。我猜测需要修改下面这段代码,让它检查后面是否还有更多内容:
// Get content for tag: the text between this tag's closing '>' (endTagIndex)
// and the next '<' found after it.
int nextTag = in.find_first_of('<', endTagIndex);
// The code treats -1 as "no further '<' found"; in that case no content is set.
if (nextTag != -1) {
// Only the text up to the very next '<' is captured.
string content = in.substr(endTagIndex + 1, nextTag - endTagIndex - 1);
decodeHTML(content);
node.setContent(content);
}
如果您需要查看此处未贴出的代码,可以访问GitHub仓库:https://github.com/Tyler-Hilbert/Dom-Builder/tree/SmashBoards