Question

我不能整齐地使用UTF16编码的字符串。这是解析UTF16编码的字符串的正确方法吗？这适用于ascii编码，但不适用于UTF16。我哪里错了？

我正在用此代码获取空HTML文档。

int rc = -1;
TidyDoc tdoc = tidyCreate();
TidyBuffer output = { 0 };
TidyBuffer errbuf = { 0 };

bool ok = tidyOptSetBool(tdoc, TidyXhtmlOut, yes);  // Convert to XHTML
ok = tidyOptSetBool(tdoc, TidyForceOutput, yes);

rc = tidySetInCharEncoding(tdoc, "utf16le");
rc = tidySetOutCharEncoding(tdoc, "utf16le");

char* test = (char*)L"<html> \
    <head><meta name = 'author' content = 'John Doe'>< / head> \
    <body> \
    <h1>My First Heading< / h1> \
    <p>My first paragraph.< / p> \
    < / body> \
    < / html>";

if (rc >= 0)
{
    rc = tidySetErrorBuffer(tdoc, &errbuf);      // Capture diagnostics
}

if (rc >= 0)
{
    rc = tidyParseString(tdoc, test);           // Parse the input
}

if (rc >= 0)
{
    //rc = tidyCleanAndRepair(tdoc);               // Tidy it up!
}

if (extractText == true)
{
    htmlText = new wstring();
    tidyOptSetBool(tdoc, TidyOutputBOM, no);
    //iterateNode(tdoc, tidyGetBody(tdoc), *htmlText);
}
rc = tidyRunDiagnostics(tdoc);

if (rc >= 0)
{
    // we dont want utf16 BOM
    ok = tidyOptSetBool(tdoc, TidyOutputBOM, no);
    rc = tidySaveBuffer(tdoc, &output);
}

如果我更改为

rc = tidySetInCharEncoding(tdoc, "ascii");
rc = tidySetOutCharEncoding(tdoc, "utf16le");

char* test = "<html>...

有效

HTML Tidy-无法解析utf16字符串

0 个答案: