// Narrow string initialised from the literal "á"; how many bytes it holds
// depends on the encoding the compiler uses for the literal.
std::string text = "á";
“á”是双字节字符(假设采用UTF-8编码),所以下面的行打印2。
// size() reports the number of char elements stored, not the number of
// displayed characters.
std::cout << text.size() << "\n";
但std::cout
仍能正确打印文字。
// Streams the stored chars to standard output unchanged, followed by a newline.
std::cout << text << "\n";
我将text
传递给boost::property_tree::ptree
,然后传递给write_json
// Build a property tree with a single entry: key "text" -> the string above.
boost::property_tree::ptree root;
root.put<std::string>("text", text);
// Serialise the tree as JSON into an in-memory stream, then print the result.
std::stringstream ss;
boost::property_tree::json_parser::write_json(ss, root);
std::cout << ss.str() << "\n";
结果是
{
"text": "\u00C3\u00A1"
}
文本等于“Ã¡”,它与“á”不同。
是否可以在不切换到std::wstring
的情况下解决此问题?更改库(boost::property_tree::ptree
)是否可以解决此问题?
答案 0 :(得分:10)
我找到了一些解决方案。
通常,您需要为boost::property_tree::json_parser::create_escapes
提供针对[Ch=char]
的模板特化,以实现“针对特殊情况的无错误转义”。
JSON标准假设所有字符串都是UTF-16编码并使用“\uXXXX”转义,但某些库支持使用“\xXX”转义的UTF-8编码。如果JSON文件可以用UTF-8编码,则可以让所有高于0x7F的字符原样通过,这也是原函数的本意。
我在使用boost::property_tree::json_parser::write_json
之前放了这段代码。它来自boost_1_49_0/boost/property_tree/detail/json_parser_write.hpp
:
namespace boost { namespace property_tree { namespace json_parser
{
    // Explicit specialization of create_escapes for narrow (char) strings.
    //
    // Returns a copy of `s` that can be placed between the quotes of a JSON
    // string literal:
    //   * '"', '\\', '/' and the control characters \b \f \n \r are replaced
    //     by their two-character backslash escapes;
    //   * every other byte below 0x20 becomes "\u00XX" (uppercase hex);
    //   * every remaining byte -- 0x20 and above, including bytes with the
    //     high bit set -- is copied through unchanged, so multi-byte
    //     sequences already present in `s` reach the output intact.
    // The bytes are not validated or decoded in any way.
    //
    // NOTE(review): an explicit specialization of a function template is not
    // implicitly inline, and it has to be declared before the first use that
    // would instantiate the primary template -- keep this in one translation
    // unit, ahead of the write_json call.
    template<>
    std::basic_string<char> create_escapes(const std::basic_string<char> &s)
    {
        const char *hex_digits = "0123456789ABCDEF";
        std::basic_string<char> escaped;

        for (std::basic_string<char>::size_type i = 0; i != s.size(); ++i)
        {
            const char raw = s[i];
            // Work on the unsigned value so the result does not depend on
            // whether plain char is signed on this platform.
            const unsigned char byte = static_cast<unsigned char>(raw);

            switch (raw)
            {
            case '\b': escaped += "\\b";  break;
            case '\f': escaped += "\\f";  break;
            case '\n': escaped += "\\n";  break;
            case '\r': escaped += "\\r";  break;
            case '/':  escaped += "\\/";  break;
            case '"':  escaped += "\\\""; break;
            case '\\': escaped += "\\\\"; break;
            default:
                if (byte >= 0x20)
                {
                    // Printable ASCII, DEL and all high-bit bytes pass through.
                    escaped += raw;
                }
                else
                {
                    // Remaining control characters: value fits in two hex digits.
                    escaped += "\\u00";
                    escaped += hex_digits[byte >> 4];
                    escaped += hex_digits[byte & 0x0F];
                }
                break;
            }
        }
        return escaped;
    }
} } }
我得到的输出:
{
"text": "aáb"
}
此外,函数boost::property_tree::json_parser::a_unicode
在将已转义的Unicode字符读入有符号字符(signed char)时也存在类似的问题。
答案 1 :(得分:0)
Boost 在 1.59 版本中修复了这个问题。升级版本时需要谨慎,具体改动可以在下面的链接中查看: https://www.boost.org/users/history/version_1_59_0.html
答案 2 :(得分:-1)
支持基本多语言平面之外的字符(通过UTF-16代理对):
// Appends the six-character JSON escape "\uXXXX" (uppercase hex digits) for
// one 16-bit code unit to `out`. Only the low 16 bits of `unit` are used.
template<class Ch>
void append_u16_escape(std::basic_string<Ch> &out, unsigned long unit)
{
    const char * hexdigits = "0123456789ABCDEF";
    out += Ch('\\'); out += Ch('u');
    out += Ch(hexdigits[(unit >> 12) & 0xF]);
    out += Ch(hexdigits[(unit >> 8) & 0xF]);
    out += Ch(hexdigits[(unit >> 4) & 0xF]);
    out += Ch(hexdigits[unit & 0xF]);
}

// Returns a copy of `s` escaped for use inside a JSON string literal.
//
// Each element of `s` is treated as one code unit / code point:
//   * printable ASCII (0x20..0x7F) other than '"', '/' and '\\' is copied
//     through unchanged;
//   * \b \f \n \r '/' '"' '\\' get their two-character backslash escapes;
//   * any other value up to 0xFFFF is written as "\uXXXX";
//   * values 0x10000..0x10FFFF are written as a UTF-16 surrogate pair
//     ("\uD8xx\uDCxx");
//   * values above 0x10FFFF are not valid code points and are written as
//     "\uFFFD" (REPLACEMENT CHARACTER) instead of wrapping into an unrelated
//     code point.
//
// Elements are never combined: with an 8-bit Ch every byte above 0x7F is
// escaped on its own ("\u00XX"), so multi-byte UTF-8 input is NOT decoded.
// Use a wide character type holding whole code points for non-ASCII text.
template<class Ch>
std::basic_string<Ch> create_escapes(const std::basic_string<Ch> &s)
{
    std::basic_string<Ch> result;
    typename std::basic_string<Ch>::const_iterator b = s.begin();
    typename std::basic_string<Ch>::const_iterator e = s.end();
    while (b != e)
    {
        // Upper bound is 0x7F so that every non-ASCII value, U+0080
        // included, is escaped consistently.
        if (*b == 0x20 || *b == 0x21 || (*b >= 0x23 && *b <= 0x2E) ||
            (*b >= 0x30 && *b <= 0x5B) || (*b >= 0x5D && *b <= 0x7F))
            result += *b;
        else if (*b == Ch('\b')) result += Ch('\\'), result += Ch('b');
        else if (*b == Ch('\f')) result += Ch('\\'), result += Ch('f');
        else if (*b == Ch('\n')) result += Ch('\\'), result += Ch('n');
        else if (*b == Ch('\r')) result += Ch('\\'), result += Ch('r');
        else if (*b == Ch('/')) result += Ch('\\'), result += Ch('/');
        else if (*b == Ch('"')) result += Ch('\\'), result += Ch('"');
        else if (*b == Ch('\\')) result += Ch('\\'), result += Ch('\\');
        else
        {
            // Go through the unsigned counterpart of Ch first so a negative
            // (signed) code unit is not sign-extended when widened.
            typedef typename make_unsigned<Ch>::type UCh;
            const unsigned long u = static_cast<unsigned long>(static_cast<UCh>(*b));
            if (u <= 0xFFFFul)
            {
                append_u16_escape(result, u);
            }
            else if (u <= 0x10FFFFul)
            {
                // Supplementary plane: split the 20-bit offset into a
                // high (0xD800..) and low (0xDC00..) surrogate.
                const unsigned long offset = u - 0x10000ul;
                append_u16_escape(result, 0xD800ul + (offset >> 10));
                append_u16_escape(result, 0xDC00ul + (offset & 0x3FFul));
            }
            else
            {
                append_u16_escape(result, 0xFFFDul);
            }
        }
        ++b;
    }
    return result;
}