Here我找到了HTML编码/转义特殊字符的方法。现在我想知道如何在C ++中浏览HTML编码文本?
所以代码库是:
#include <algorithm>
namespace xml {
// Helper for null-terminated ASCII strings (no end of string iterator).
template<typename InIter, typename OutIter>
OutIter copy_asciiz ( InIter begin, OutIter out )
{
while ( *begin != '\0' ) {
*out++ = *begin++;
}
return (out);
}
// XML escaping in it's general form. Note that 'out' is expected
// to an "infinite" sequence.
template<typename InIter, typename OutIter>
OutIter escape ( InIter begin, InIter end, OutIter out )
{
static const char bad[] = "&<>";
static const char* rep[] = {"&", "<", ">"};
static const std::size_t n = sizeof(bad)/sizeof(bad[0]);
for ( ; (begin != end); ++begin )
{
// Find which replacement to use.
const std::size_t i =
std::distance(bad, std::find(bad, bad+n, *begin));
// No need for escaping.
if ( i == n ) {
*out++ = *begin;
}
// Escape the character.
else {
out = copy_asciiz(rep[i], out);
}
}
return (out);
}
}
和
#include <iterator>
#include <string>
namespace xml {
// Get escaped version of "content".
std::string escape ( const std::string& content )
{
std::string result;
result.reserve(content.size());
escape(content.begin(), content.end(), std::back_inserter(result));
return (result);
}
// Escape data on the fly, using "constant" memory.
void escape ( std::istream& in, std::ostream& out )
{
escape(std::istreambuf_iterator<char>(in),
std::istreambuf_iterator<char>(),
std::ostreambuf_iterator<char>(out));
}
}
它 - 代码和平代码 - 适用于:
#include <iostream>
int main ( int, char ** )
{
std::cout << xml::escape("<foo>bar & qux</foo>") << std::endl;
}
所以我想知道 - 如何以这种方式制作HTML unescape?
答案 0 :(得分:2)
看看我是如何解决'&#(\d+);'
字符串的类似问题,即使用boost::spirit,boost::regex_token_iterator,Flex,Perl的数字字符引用(NCR) {3}}
如果您不需要转换all html entities,则正则表达式为&(amp|lt|gt);
。