Question

我正在使用返回UTF-16BE字符串的API。我需要将它们转换为UTF-8，以便在UI内部显示（后者又接受char *缓冲区）。为此，我决定使用boost::locale::conv::utf_to_utf()并编写转换例程：

// defined by the API
// One 16-bit code unit as used by the API's strings.
using t_wchar_t = uint16_t;
// String of 16-bit code units as returned by the API.
using t_wstring = std::basic_string<t_wchar_t>;

char* ToUtf8(const t_wstring &utf16)
{
    // print out the input buffer, using printfs instead of cout because I have to
    printf("t_wchar_t = %zu, wchar_t = %zu, char = %zu\n", 
            sizeof(t_wchar_t), sizeof(wchar_t), sizeof(char));
    const t_wchar_t *inBuf = utf16.c_str();
    const size_t inSize = utf16.size();
    // buf2str is my debugging function for printing buffers as raw bytes
    printf("UTF16 size: %zu, buf: %s\n", inSize, 
            buf2str(inBuf, inSize).c_str());

    // make a copy of the input buffer, prepend a BE BOM 
    // (didn't work without it, does not work with it either)
    t_wchar_t *workBuf = new t_wchar_t[inSize + 1];
    workBuf[0] = 0xfeff;
    std::memcpy(workBuf + 1, inBuf, inSize * sizeof(t_wchar_t));
    printf("Workbuf: %s\n", buf2str(workBuf, inSize + 1).c_str());

    // perform conversion, print out the result buffer
    const string utf8Str = boost::locale::conv::utf_to_utf<char>(workBuf, 
            workBuf + inSize + 1);
    const size_t utf8Size = utf8Str.size();
    printf("UTF8 size: %zu, buf: %s\n", utf8Size, 
            buf2str(utf8Str.c_str(), utf8Size).c_str());

    // allocate a char buffer, copy the result there and return the pointer
    char *ret = new char[utf8Size + 1];
    std::memcpy(ret, utf8Str.c_str(), (utf8Size + 1)*sizeof(char));
    printf("Return buf[%zu]: <%s>\n", 
            buf2str(ret, utf8Size + 1).c_str());
    delete [] workBuf;
    return ret;
}

但是，无论是在API返回的字符串上还是在一些测试数据上运行，它返回的都是垃圾数据：

int main()
{
    // simulate the input, make an example UTF-16BE stream from raw bytes
    const unsigned char test[] ={ '\0', 'H', '\0', 'e', '\0', 'l', '\0', 'l', '\0', 'o', 
            '\0', ',', '\0', ' ', '\0', 'w', '\0', 'o', '\0', 'r', '\0', 'l', 
            '\0', 'd', '\0', '!' };
    // create a t_wstring from the 16bit code sequences directly
    const t_wstring testStr(reinterpret_cast<const t_wchar_t*>(test), 13);
    printf("test data: %s\n", buf2str(testStr.c_str(), testStr.size()).c_str());

    char* utf8 = ToUtf8(testStr);
      delete [] utf8;

    return 0;
}

以下是程序针对 "Hello, world!" 字符串的部分输出。如您所见，转换后的UTF8缓冲区包含垃圾数据。

test data: [13/26] '' (0) 'H' (72) '' (0) 'e' (101) '' (0) 'l' (108) '' (0) 'l' (108) '' (0) 'o' (111) '' (0) ',' (44)
  '' (0) ' ' (32) '' (0) 'w' (119) '' (0) 'o' (111) '' (0) 'r' (114) '' (0) 'l' (108) '' (0) 'd' (100) '' (0) '!' (33)
  t_wchar_t = 2, wchar_t = 4, char = 1
  UTF16 size: 13, buf: [13/26] '' (0) 'H' (72) '' (0) 'e' (101) '' (0) 'l' (108) '' (0) 'l' (108) '' (0) 'o' (111)
  '' (0) ',' (44) '' (0) ' ' (32) '' (0) 'w' (119) '' (0) 'o' (111) '' (0) 'r' (114) '' (0) 'l' (108) '' (0) 'd' (100)
  '' (0) '!' (33)
  Workbuf: [13/26] '' (0) 'H' (72) '' (0) 'e' (101) '' (0) 'l' (108) '' (0) 'l' (108) '' (0) 'o' (111) '' (0) ',' (44)
  '' (0) ' ' (32) '' (0) 'w' (119) '' (0) 'o' (111) '' (0) 'r' (114) '' (0) 'l' (108) '' (0) 'd' (100) '' (0) '!' (33)
  UTF8 size: 42, buf: [42/42] '' (228) '' (160) '' (128) '' (230) '' (148) '' (128) '' (230) '' (176) '' (128) '' (230)
  '' (176) '' (128) '' (230) '' (188) '' (128) '' (226) '' (176) '' (128) '' (226) '' (128) '' (128) '' (231) '' (156)
  '' (128) '' (230) '' (188) '' (128) '' (231) '' (136) '' (128) '' (230) '' (176) '' (128) '' (230) '' (144) '' (128)
  '' (226) '' (132) '' (128) '' (226) '' (188) '' (179)

我做错了什么？感谢。

编辑：感谢 @TheUndeadFish 的评论，我在转换之前对工作缓冲区做了字节序转换，现在它按预期工作了：

// Convert each 16-bit unit of the work buffer from big-endian to host byte
// order before it is passed to utf_to_utf().
// NOTE(review): be16toh() is not standard C++; presumably it comes from
// <endian.h> on glibc -- confirm it is available on every target platform.
// NOTE(review): the loop covers indices [0, inSize). In ToUtf8 as posted,
// workBuf holds inSize + 1 units with a BOM at index 0; presumably the BOM
// was dropped when this loop was added -- verify, otherwise the last unit is
// left unconverted.
for (size_t i = 0; i < inSize; ++i)
{
    workBuf[i] = be16toh(workBuf[i]);
}

Answer 1

在你的情况下，utf_to_utf 似乎把输入当作小端UTF-16来处理。

取前4个字节：

你想用字节 00 72 00 101（十进制）来编码 U+0048 U+0065。

但按相反的字节序解释时，它们编码的是 U+4800 U+6500。

转换为UTF-8后，就得到字节 e4 a0 80 e6 94 80。

把它们写成十进制就是 228 160 128 230 148 128，正是你那些“垃圾”数据开头的几个值。

使用Boost.Locale的UTF-16BE到UTF-8会产生垃圾

1 个答案: