考虑:
STDMETHODIMP CFileSystemAPI::setRRConfig( BSTR config_str, VARIANT* ret )
{
mReportReaderFactory.reset( new sbis::report_reader::ReportReaderFactory() );
USES_CONVERSION;
std::string configuration_str = W2A( config_str );
但是在config_str中我得到一个UTF-16字符串。如何在这段代码中将其转换为UTF-8?
答案 0 :(得分:2)
如果您使用的是C ++ 11,可以查看:
http://www.cplusplus.com/reference/codecvt/codecvt_utf8_utf16/
答案 1 :(得分:1)
你可以做这样的事情
/// Converts a UTF-16 wide string to a UTF-8 encoded std::string using the
/// Win32 WideCharToMultiByte API.
///
/// The explicit length of `wstr` is passed to the API (instead of -1), so
/// embedded NUL characters are converted as well rather than silently
/// truncating the result.
///
/// @param wstr  UTF-16 input; may be empty.
/// @return      The UTF-8 bytes; an empty string for empty input.
/// @throws std::runtime_error if the input is too long for the API's `int`
///         length parameter or if either WideCharToMultiByte call fails.
std::string WstrToUtf8Str(const std::wstring& wstr)
{
    std::string retStr;
    if (wstr.empty())
    {
        return retStr;
    }

    // WideCharToMultiByte takes the length as int; reject inputs that do not
    // survive the round trip through int.
    const int wideLength = static_cast<int>(wstr.size());
    if (wideLength <= 0 || static_cast<std::size_t>(wideLength) != wstr.size())
    {
        throw std::runtime_error(std::string(__FUNCTION__)
            + " std::string WstrToUtf8Str failed to convert wstring: input too long");
    }

    // First call: ask only for the number of UTF-8 bytes required.
    const int sizeRequired = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wideLength,
                                                 nullptr, 0, nullptr, nullptr);
    if (sizeRequired <= 0)
    {
        throw std::runtime_error(std::string(__FUNCTION__)
            + " std::string WstrToUtf8Str failed to convert wstring");
    }

    // Second call: convert directly into the result buffer.
    retStr.resize(static_cast<std::size_t>(sizeRequired));
    const int bytesConverted = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wideLength,
                                                   &retStr[0], sizeRequired,
                                                   nullptr, nullptr);
    if (bytesConverted == 0)
    {
        // The wide input is deliberately not streamed into the message:
        // a wchar_t* written to a narrow stream prints a pointer value.
        throw std::runtime_error(std::string(__FUNCTION__)
            + " std::string WstrToUtf8Str failed to convert wstring");
    }
    retStr.resize(static_cast<std::size_t>(bytesConverted));
    return retStr;
}
您可以将您的 BSTR 作为 std::wstring 提供给该函数。
答案 2 :(得分:1)
我在 UTF-8<->UTF-16<->UTF-32 之间实现了两种转换变体,第一个变体从头开始完全实现所有转换,第二个使用标准 std::codecvt 和std::wstring_convert(这两个类从 C++17 开始被弃用,但仍然存在,也保证在 C++11/C++14 中存在)。
如果你不喜欢我的代码,那么你可以使用几乎单头的 C++ 库 utfcpp,它应该已经被许多客户很好地测试过。
要将 UTF-8 转换为 UTF-16 只需调用 Utf32To16(Utf8To32(str))
,将 UTF-16 转换为 UTF-8 调用 Utf32To8(Utf16To32(str))
。或者您可以使用我方便的辅助函数 UtfConv<std::wstring>(std::string("abc"))
将 UTF-8 转换为 UTF-16 或 UtfConv<std::string>(std::wstring(L"abc"))
将 UTF-16 转换为 UTF-8,UtfConv
实际上可以在任意两种 UTF 编码的字符串之间进行转换。在 Test(cs)
宏中查看这些和其他用法的示例。
两种变体都符合 C++11。此外,它们还可以在 CLang/GCC/MSVC 编译器中编译(请参阅下面的“在线试用!”链接),并在 Windows/Linux 操作系统中进行了测试。
您必须使用 UTF-8 编码将我的两个代码片段保存在文件中,并向 CLang/GCC 提供选项 -finput-charset=UTF-8 -fexec-charset=UTF-8
,向 MSVC 提供选项 /utf-8
。仅当您使用非 ascii 字符放置文字字符串时,才需要这种 utf-8 保存和选项,就像我在代码中所做的那样,仅用于测试目的。要使用函数本身,您不需要这个 utf-8 保存和选项。
包含 <windows.h>
和 <clocale>
和 <iostream>
,也调用 SetConsoleOutputCP(65001)
和 std::setlocale(LC_ALL, "en_US.UTF-8")
仅用于测试目的,以正确设置和输出到 UTF -8 控制台。转换函数不需要这些东西。
部分代码不是很必要,我指的是UtfHelper相关的结构和函数,它们只是转换的辅助函数,主要是为了跨平台处理std::wstring
而创建的,因为wchar_t
在 Linux 上通常为 32 位,在 Windows 上为 16 位。只有低级函数 Utf8To32
、Utf32To8
、Utf16To32
、Utf32To16
才是转换真正需要的东西。
变体 1 是根据 UTF-8 和 UTF-16 编码的维基百科描述创建的。
如果您发现错误或任何改进(尤其是在变体 1 中),请告诉我,我会修复它们。
变体 1
#include <string>
#include <iostream>
#include <stdexcept>
#include <type_traits>
#include <cstdint>
#ifdef _WIN32
#include <windows.h>
#else
#include <clocale>
#endif
// Throws std::runtime_error when `cond` evaluates to false. The message
// contains the stringified condition, the source line and `msg`.
// The body is wrapped in do { } while (0) so the macro expands to exactly one
// statement and `ASSERT_MSG(c, m);` stays valid inside an unbraced if/else.
#define ASSERT_MSG(cond, msg) do { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg: " + std::string(msg)); } while (0)
// Same check as ASSERT_MSG, with an empty message.
#define ASSERT(cond) ASSERT_MSG(cond, "")
// Encodes a UTF-32 string as UTF-8.
//
// U8StrT is any string type with 1-byte code units (default std::string).
// Code points are not validated: surrogates and values above 0x10FFFF are
// encoded with the same bit-packing scheme, using 5, 6 or 7 bytes where
// needed (this goes beyond the 4-byte limit of standard UTF-8).
//
// Returns the encoded bytes; an empty input yields an empty string.
template <typename U8StrT = std::string>
inline static U8StrT Utf32To8(std::u32string const & s) {
    static_assert(sizeof(typename U8StrT::value_type) == 1, "Char byte-size should be 1 for UTF-8 strings!");
    typedef typename U8StrT::value_type VT;
    typedef uint8_t u8;
    U8StrT r;
    for (auto c: s) {
        // Total number of bytes needed for this code point.
        size_t const nby = c <= 0x7FU ? 1 : c <= 0x7FFU ? 2 : c <= 0xFFFFU ? 3 : c <= 0x1FFFFFU ? 4 : c <= 0x3FFFFFFU ? 5 : c <= 0x7FFFFFFFU ? 6 : 7;
        if (nby == 1) {
            r.push_back(VT(u8(c)));
            continue;
        }
        // Shift in 64 bits: for nby == 7 the lead-byte shift is 36, which
        // would be undefined behaviour on a 32-bit char32_t.
        uint64_t const wide = c;
        // Lead byte: nby high bits set, followed by a zero bit.
        u8 const prefix = u8(0xFFU << (8 - nby));
        r.push_back(VT(u8(prefix | u8(wide >> (6 * (nby - 1))))));
        // Continuation bytes: 10xxxxxx, most significant 6-bit group first.
        for (size_t i = 1; i < nby; ++i)
            r.push_back(VT(u8(0x80U | (0x3FU & (wide >> (6 * (nby - 1 - i)))))));
    }
    return r;
}
// Decodes a UTF-8 string (any string type with 1-byte code units) to UTF-32.
//
// Multi-byte sequences of up to 7 bytes are accepted. A malformed lead byte,
// a truncated sequence or a bad continuation byte trips an ASSERT, which
// throws. Overlong forms and surrogate values are not rejected.
template <typename U8StrT>
inline static std::u32string Utf8To32(U8StrT const & s) {
    static_assert(sizeof(typename U8StrT::value_type) == 1, "Char byte-size should be 1 for UTF-8 strings!");
    using u8 = uint8_t;
    std::u32string decoded;
    u8 const * it = reinterpret_cast<u8 const *>(s.c_str());
    u8 const * const end = reinterpret_cast<u8 const *>(s.c_str() + s.length());
    while (it < end) {
        // Single-byte (ASCII) fast path.
        if (*it <= 0x7FU) {
            decoded.push_back(char32_t(*it));
            ++it;
            continue;
        }
        // A lead byte must start with at least two set bits.
        ASSERT((*it & 0xC0U) == 0xC0U);
        // The count of leading set bits is the total sequence length.
        size_t nby = 0;
        u8 lead_bits = *it;
        while ((lead_bits & 0x80U) != 0) {
            lead_bits <<= 1;
            ++nby;
        }
        ASSERT(nby <= 7);
        ASSERT((end - it) >= nby);
        // Payload bits of the lead byte, then 6 bits per continuation byte.
        char32_t code_point = *it & (u8(0xFFU) >> (nby + 1));
        for (size_t i = 1; i < nby; ++i) {
            ASSERT((it[i] & 0xC0U) == 0x80U);
            code_point = (code_point << 6) | (it[i] & 0x3FU);
        }
        it += nby;
        decoded.push_back(code_point);
    }
    return decoded;
}
// Encodes a UTF-32 string as UTF-16 (any string type with 2-byte code units,
// default std::u16string).
//
// Values up to 0xFFFF are copied as a single unit without further checks;
// larger values become a surrogate pair. A value above 0x10FFFF trips the
// ASSERT, which throws.
template <typename U16StrT = std::u16string>
inline static U16StrT Utf32To16(std::u32string const & s) {
    static_assert(sizeof(typename U16StrT::value_type) == 2, "Char byte-size should be 2 for UTF-16 strings!");
    using VT = typename U16StrT::value_type;
    using u16 = uint16_t;
    U16StrT encoded;
    for (char32_t c : s) {
        if (c > 0xFFFFU) {
            ASSERT(c <= 0x10FFFFU);
            // 20-bit offset split into high (lead) and low (trail) halves.
            char32_t const offset = c - 0x10000U;
            u16 const lead = u16(0xD800U | ((offset >> 10) & 0x3FFU));
            u16 const trail = u16(0xDC00U | (offset & 0x3FFU));
            encoded.push_back(VT(lead));
            encoded.push_back(VT(trail));
        } else {
            encoded.push_back(VT(c));
        }
    }
    return encoded;
}
// Decodes a UTF-16 string (any string type with 2-byte code units) to UTF-32.
//
// A lone trail surrogate, a lead surrogate at the end of input, or a lead
// surrogate not followed by a trail surrogate trips an assertion, which
// throws.
template <typename U16StrT>
inline static std::u32string Utf16To32(U16StrT const & s) {
    static_assert(sizeof(typename U16StrT::value_type) == 2, "Char byte-size should be 2 for UTF-16 strings!");
    using u16 = uint16_t;
    std::u32string decoded;
    u16 const * it = reinterpret_cast<u16 const *>(s.c_str());
    u16 const * const end = reinterpret_cast<u16 const *>(s.c_str() + s.length());
    while (it < end) {
        u16 const unit = *it;
        char32_t code_point = 0;
        if (unit >= 0xD800U && unit <= 0xDFFFU) {
            if (unit >= 0xDC00U) {
                // A trail surrogate may not start a sequence.
                ASSERT_MSG(false, "Unallowed UTF-16 sequence!");
            } else {
                ASSERT(end - it >= 2);
                u16 const trail = it[1];
                code_point = char32_t(unit & 0x3FFU) << 10;
                if (trail >= 0xDC00U && trail <= 0xDFFFU) {
                    code_point |= trail & 0x3FFU;
                    code_point += 0x10000U;
                } else {
                    ASSERT_MSG(false, "Unallowed UTF-16 sequence!");
                }
                it += 2;
            }
        } else {
            // Plain BMP code unit.
            code_point = unit;
            ++it;
        }
        decoded.push_back(code_point);
    }
    return decoded;
}
// Dispatches UTF conversion on the byte size of StrT's code unit:
// 1 byte -> UTF-8, 2 bytes -> UTF-16, 4 bytes -> UTF-32.
// Each specialization offers UtfTo32 (StrT -> std::u32string) and
// UtfFrom32 (std::u32string -> StrT).
template <typename StrT, size_t NumBytes = sizeof(typename StrT::value_type)> struct UtfHelper;

// 1-byte code units: treated as UTF-8.
template <typename StrT> struct UtfHelper<StrT, 1> {
    inline static std::u32string UtfTo32(StrT const & s) { return Utf8To32(s); }
    inline static StrT UtfFrom32(std::u32string const & s) { return Utf32To8<StrT>(s); }
};

// 2-byte code units: treated as UTF-16.
template <typename StrT> struct UtfHelper<StrT, 2> {
    inline static std::u32string UtfTo32(StrT const & s) { return Utf16To32(s); }
    inline static StrT UtfFrom32(std::u32string const & s) { return Utf32To16<StrT>(s); }
};

// 4-byte code units: already UTF-32, only the element type differs.
// Elements are converted one by one instead of reinterpreting the buffer
// through a char32_t pointer, which would violate strict aliasing when the
// code unit type is not char32_t (e.g. a 4-byte wchar_t).
template <typename StrT> struct UtfHelper<StrT, 4> {
    inline static std::u32string UtfTo32(StrT const & s) {
        std::u32string r;
        r.reserve(s.length());
        for (auto ch : s)
            r.push_back(static_cast<char32_t>(ch));
        return r;
    }
    inline static StrT UtfFrom32(std::u32string const & s) {
        StrT r;
        r.reserve(s.length());
        for (char32_t ch : s)
            r.push_back(static_cast<typename StrT::value_type>(ch));
        return r;
    }
};
template <typename StrT> inline static std::u32string UtfTo32(StrT const & s) {
return UtfHelper<StrT>::UtfTo32(s);
}
template <typename StrT> inline static StrT UtfFrom32(std::u32string const & s) {
return UtfHelper<StrT>::UtfFrom32(s);
}
// Converts between two UTF string types, going through UTF-32 as the
// intermediate form.
template <typename StrToT, typename StrFromT> inline static StrToT UtfConv(StrFromT const & s) {
    std::u32string const utf32 = UtfTo32(s);
    return UtfFrom32<StrToT>(utf32);
}
// Round-trip self-test for one string literal `cs`.
//
// Prints ten comma-separated results on one line, each produced by sending
// the text through a different chain of conversions and finally back to
// UTF-8, then prints a second line with the UTF-8 byte count and the UTF-16
// byte count (number of 16-bit units times 2).
//
// `cs` must be a plain narrow string literal: the macro pastes the u, U and L
// prefixes onto it (u##cs, U##cs, L##cs) to obtain the other literal kinds.
// The macro expands to several statements without enclosing braces, so it
// must not be used as the body of an unbraced if/else or loop.
#define Test(cs) \
std::cout << Utf32To8(Utf8To32(std::string(cs))) << ", "; \
std::cout << Utf32To8(Utf16To32(Utf32To16(Utf8To32(std::string(cs))))) << ", "; \
std::cout << Utf32To8(Utf16To32(std::u16string(u##cs))) << ", "; \
std::cout << Utf32To8(std::u32string(U##cs)) << ", "; \
std::cout << UtfConv<std::string>(UtfConv<std::u16string>(UtfConv<std::u32string>(UtfConv<std::u32string>(UtfConv<std::u16string>(std::string(cs)))))) << ", "; \
std::cout << UtfConv<std::string>(UtfConv<std::wstring>(UtfConv<std::string>(UtfConv<std::u32string>(UtfConv<std::u32string>(std::string(cs)))))) << ", "; \
std::cout << UtfFrom32<std::string>(UtfTo32(std::string(cs))) << ", "; \
std::cout << UtfFrom32<std::string>(UtfTo32(std::u16string(u##cs))) << ", "; \
std::cout << UtfFrom32<std::string>(UtfTo32(std::wstring(L##cs))) << ", "; \
std::cout << UtfFrom32<std::string>(UtfTo32(std::u32string(U##cs))) << std::endl; \
std::cout << "UTF-8 num bytes: " << std::dec << Utf32To8(std::u32string(U##cs)).size() << ", "; \
std::cout << "UTF-16 num bytes: " << std::dec << (Utf32To16(std::u32string(U##cs)).size() * 2) << std::endl;
// Test driver: switches the console to UTF-8 output, then runs the Test
// macro on ASCII, Cyrillic (2-byte UTF-8) and supplementary-plane text
// (4-byte UTF-8, surrogate pairs in UTF-16).
//
// Returns 0 on success, -1 if any conversion throws.
int main() {
#ifdef _WIN32
    SetConsoleOutputCP(65001);  // 65001 is the UTF-8 code page
#else
    std::setlocale(LC_ALL, "en_US.UTF-8");
#endif
    try {
        Test("World");
        Test("Привет");
        // Characters outside the BMP: each takes 4 bytes in UTF-8 and a
        // surrogate pair (4 bytes) in UTF-16.
        Test("𐐷𤭢");
        Test("𝄞");
        return 0;
    } catch (std::exception const & ex) {
        std::cout << "Exception: " << ex.what() << std::endl;
        return -1;
    }
}
输出:
World, World, World, World, World, World, World, World, World, World
UTF-8 num bytes: 5, UTF-16 num bytes: 10
Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет
UTF-8 num bytes: 12, UTF-16 num bytes: 12
𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢
UTF-8 num bytes: 8, UTF-16 num bytes: 8
𝄞, 𝄞, 𝄞, 𝄞, 𝄞, 𝄞, 𝄞, 𝄞, 𝄞, 𝄞
UTF-8 num bytes: 4, UTF-16 num bytes: 4
变体 2
#include <string>
#include <iostream>
#include <stdexcept>
#include <type_traits>
#include <locale>
#include <codecvt>
#include <cstdint>
#ifdef _WIN32
#include <windows.h>
#else
#include <clocale>
#endif
// Throws std::runtime_error when `cond` evaluates to false; the message
// contains the stringified condition and the source line.
// The body is wrapped in do { } while (0) so the macro expands to exactly one
// statement and `ASSERT(c);` stays valid inside an unbraced if/else.
#define ASSERT(cond) do { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "!"); } while (0)
// Workaround for some of MSVC compilers.
// Compiled only for MSVC with _MSC_VER between 1900 and 1914 when _DLL is not
// set. It supplies definitions for the `id` static members of the char16_t
// and char32_t std::codecvt specializations.
// NOTE(review): presumably these members are left undefined by the affected
// static runtime libraries, which causes unresolved-symbol link errors —
// confirm against the corresponding MSVC bug report.
#if defined(_MSC_VER) && (!_DLL) && (_MSC_VER >= 1900 /* VS 2015*/) && (_MSC_VER <= 1914 /* VS 2017 */)
std::locale::id std::codecvt<char16_t, char, _Mbstatet>::id;
std::locale::id std::codecvt<char32_t, char, _Mbstatet>::id;
#endif
// Decodes a UTF-8 string (any string type with 1-byte code units) to UTF-32
// using std::wstring_convert with std::codecvt_utf8<char32_t>.
template <typename U8StrT>
inline static std::u32string Utf8To32(U8StrT const & s) {
    static_assert(sizeof(typename U8StrT::value_type) == 1, "Char byte-size should be 1 for UTF-8 strings!");
    using Converter = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;
    // View the input as a plain byte range [first, last).
    char const * const first = reinterpret_cast<char const *>(s.c_str());
    char const * const last = first + s.length();
    Converter converter;
    return converter.from_bytes(first, last);
}
// Encodes a UTF-32 string as UTF-8 using std::wstring_convert with
// std::codecvt_utf8<char32_t>, returning the bytes in the requested
// 1-byte-unit string type (default std::string).
template <typename U8StrT = std::string>
inline static U8StrT Utf32To8(std::u32string const & s) {
    static_assert(sizeof(typename U8StrT::value_type) == 1, "Char byte-size should be 1 for UTF-8 strings!");
    using CharT = typename U8StrT::value_type;
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    std::string const bytes = converter.to_bytes(s.c_str(), s.c_str() + s.length());
    // Re-type the produced bytes as the target code unit type.
    CharT const * const first = reinterpret_cast<CharT const *>(bytes.c_str());
    return U8StrT(first, first + bytes.length());
}
// Decodes a UTF-16 string (any string type with 2-byte code units) to UTF-32
// using std::wstring_convert with a little-endian std::codecvt_utf16.
// The string's storage is handed to the converter as raw bytes.
template <typename U16StrT>
inline static std::u32string Utf16To32(U16StrT const & s) {
    static_assert(sizeof(typename U16StrT::value_type) == 2, "Char byte-size should be 2 for UTF-16 strings!");
    using Converter = std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff, std::little_endian>, char32_t>;
    char const * const first = reinterpret_cast<char const *>(s.c_str());
    char const * const last = reinterpret_cast<char const *>(s.c_str() + s.length());
    Converter converter;
    return converter.from_bytes(first, last);
}
// Encodes a UTF-32 string as UTF-16 using std::wstring_convert with a
// little-endian std::codecvt_utf16, returning the result in the requested
// 2-byte-unit string type (default std::u16string).
template <typename U16StrT = std::u16string>
inline static U16StrT Utf32To16(std::u32string const & s) {
    static_assert(sizeof(typename U16StrT::value_type) == 2, "Char byte-size should be 2 for UTF-16 strings!");
    using CharT = typename U16StrT::value_type;
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff, std::little_endian>, char32_t> converter;
    // The converter yields the UTF-16 data as a byte string.
    std::string const bytes = converter.to_bytes(s.c_str(), s.c_str() + s.length());
    // Re-type that byte range as 2-byte code units.
    return U16StrT(
        reinterpret_cast<CharT const *>(bytes.c_str()),
        reinterpret_cast<CharT const *>(bytes.c_str() + bytes.length()));
}
// Dispatches UTF conversion on the byte size of StrT's code unit:
// 1 byte -> UTF-8, 2 bytes -> UTF-16, 4 bytes -> UTF-32.
// Each specialization offers UtfTo32 (StrT -> std::u32string) and
// UtfFrom32 (std::u32string -> StrT).
template <typename StrT, size_t NumBytes = sizeof(typename StrT::value_type)> struct UtfHelper;

// 1-byte code units: treated as UTF-8.
template <typename StrT> struct UtfHelper<StrT, 1> {
    inline static std::u32string UtfTo32(StrT const & s) { return Utf8To32(s); }
    inline static StrT UtfFrom32(std::u32string const & s) { return Utf32To8<StrT>(s); }
};

// 2-byte code units: treated as UTF-16.
template <typename StrT> struct UtfHelper<StrT, 2> {
    inline static std::u32string UtfTo32(StrT const & s) { return Utf16To32(s); }
    inline static StrT UtfFrom32(std::u32string const & s) { return Utf32To16<StrT>(s); }
};

// 4-byte code units: already UTF-32, only the element type differs.
// Elements are converted one by one instead of reinterpreting the buffer
// through a char32_t pointer, which would violate strict aliasing when the
// code unit type is not char32_t (e.g. a 4-byte wchar_t).
template <typename StrT> struct UtfHelper<StrT, 4> {
    inline static std::u32string UtfTo32(StrT const & s) {
        std::u32string r;
        r.reserve(s.length());
        for (auto ch : s)
            r.push_back(static_cast<char32_t>(ch));
        return r;
    }
    inline static StrT UtfFrom32(std::u32string const & s) {
        StrT r;
        r.reserve(s.length());
        for (char32_t ch : s)
            r.push_back(static_cast<typename StrT::value_type>(ch));
        return r;
    }
};
template <typename StrT> inline static std::u32string UtfTo32(StrT const & s) {
return UtfHelper<StrT>::UtfTo32(s);
}
template <typename StrT> inline static StrT UtfFrom32(std::u32string const & s) {
return UtfHelper<StrT>::UtfFrom32(s);
}
// Converts between two UTF string types in two steps: source -> UTF-32,
// then UTF-32 -> target.
template <typename StrToT, typename StrFromT> inline static StrToT UtfConv(StrFromT const & s) {
    std::u32string const intermediate = UtfTo32(s);
    return UtfFrom32<StrToT>(intermediate);
}
// Round-trip self-test for one string literal `cs`.
//
// Prints ten comma-separated results on one line, each produced by sending
// the text through a different chain of conversions and finally back to
// UTF-8, then prints a second line with the UTF-8 byte count and the UTF-16
// byte count (number of 16-bit units times 2).
//
// `cs` must be a plain narrow string literal: the macro pastes the u, U and L
// prefixes onto it (u##cs, U##cs, L##cs) to obtain the other literal kinds.
// The macro expands to several statements without enclosing braces, so it
// must not be used as the body of an unbraced if/else or loop.
#define Test(cs) \
std::cout << Utf32To8(Utf8To32(std::string(cs))) << ", "; \
std::cout << Utf32To8(Utf16To32(Utf32To16(Utf8To32(std::string(cs))))) << ", "; \
std::cout << Utf32To8(Utf16To32(std::u16string(u##cs))) << ", "; \
std::cout << Utf32To8(std::u32string(U##cs)) << ", "; \
std::cout << UtfConv<std::string>(UtfConv<std::u16string>(UtfConv<std::u32string>(UtfConv<std::u32string>(UtfConv<std::u16string>(std::string(cs)))))) << ", "; \
std::cout << UtfConv<std::string>(UtfConv<std::wstring>(UtfConv<std::string>(UtfConv<std::u32string>(UtfConv<std::u32string>(std::string(cs)))))) << ", "; \
std::cout << UtfFrom32<std::string>(UtfTo32(std::string(cs))) << ", "; \
std::cout << UtfFrom32<std::string>(UtfTo32(std::u16string(u##cs))) << ", "; \
std::cout << UtfFrom32<std::string>(UtfTo32(std::wstring(L##cs))) << ", "; \
std::cout << UtfFrom32<std::string>(UtfTo32(std::u32string(U##cs))) << std::endl; \
std::cout << "UTF-8 num bytes: " << std::dec << Utf32To8(std::u32string(U##cs)).size() << ", "; \
std::cout << "UTF-16 num bytes: " << std::dec << (Utf32To16(std::u32string(U##cs)).size() * 2) << std::endl;
// Test driver: switches the console to UTF-8 output, then runs the Test
// macro on ASCII, Cyrillic (2-byte UTF-8) and supplementary-plane text
// (4-byte UTF-8, surrogate pairs in UTF-16).
//
// Returns 0 on success, -1 if any conversion throws.
int main() {
#ifdef _WIN32
    SetConsoleOutputCP(65001);  // 65001 is the UTF-8 code page
#else
    std::setlocale(LC_ALL, "en_US.UTF-8");
#endif
    try {
        Test("World");
        Test("Привет");
        // Characters outside the BMP: each takes 4 bytes in UTF-8 and a
        // surrogate pair (4 bytes) in UTF-16.
        Test("𐐷𤭢");
        Test("𝄞");
        return 0;
    } catch (std::exception const & ex) {
        std::cout << "Exception: " << ex.what() << std::endl;
        return -1;
    }
}
输出:
World, World, World, World, World, World, World, World, World, World
UTF-8 num bytes: 5, UTF-16 num bytes: 10
Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет
UTF-8 num bytes: 12, UTF-16 num bytes: 12
𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢, 𐐷𤭢
UTF-8 num bytes: 8, UTF-16 num bytes: 8
𝄞, 𝄞, 𝄞, 𝄞, 𝄞, 𝄞, 𝄞, 𝄞, 𝄞, 𝄞
UTF-8 num bytes: 4, UTF-16 num bytes: 4
答案 3 :(得分:0)
// Appends the UTF-8 style encoding of one UCS character to `buffer`.
//
// buffer         destination; the caller must guarantee room for up to six
//                bytes starting at buffer[*offset] (no bounds check is done).
// offset         in/out write position; advanced by the number of bytes
//                written (1 to 6), left unchanged for an invalid character.
// ucs_character  code point to encode. Values up to 0x7FFFFFFF are encoded
//                with 1 to 6 bytes; the 5- and 6-byte forms lie outside
//                standard UTF-8, which ends at 0x10FFFF.
//
// The value is encoded as-is: a UTF-16 surrogate code unit passed on its own
// is written as a 3-byte sequence rather than being combined with its
// partner.
void encode_unicode_character(char* buffer, int* offset, wchar_t ucs_character)
{
    // Work on an unsigned copy: where wchar_t is signed, a negative value
    // would otherwise satisfy `<= 0x7F` and be emitted as a bogus single
    // byte instead of being rejected as invalid.
    const unsigned long code = static_cast<unsigned long>(ucs_character);

    int continuation_count;     // number of 10xxxxxx bytes after the lead byte
    unsigned long lead_marker;  // fixed high bits of the lead byte
    if (code <= 0x7FUL)
    {
        // Plain single-byte ASCII.
        continuation_count = 0;
        lead_marker = 0x00UL;
    }
    else if (code <= 0x7FFUL)
    {
        continuation_count = 1;
        lead_marker = 0xC0UL;
    }
    else if (code <= 0xFFFFUL)
    {
        continuation_count = 2;
        lead_marker = 0xE0UL;
    }
    else if (code <= 0x1FFFFFUL)
    {
        continuation_count = 3;
        lead_marker = 0xF0UL;
    }
    else if (code <= 0x3FFFFFFUL)
    {
        continuation_count = 4;
        lead_marker = 0xF8UL;
    }
    else if (code <= 0x7FFFFFFFUL)
    {
        continuation_count = 5;
        lead_marker = 0xFCUL;
    }
    else
    {
        // Invalid char; don't encode anything.
        return;
    }

    // Lead byte: marker bits plus the most significant payload bits.
    buffer[(*offset)++] = static_cast<char>(lead_marker | (code >> (6 * continuation_count)));
    // Continuation bytes: 6 payload bits each, most significant group first.
    for (int shift = 6 * (continuation_count - 1); shift >= 0; shift -= 6)
    {
        buffer[(*offset)++] = static_cast<char>(0x80UL | ((code >> shift) & 0x3FUL));
    }
}
要了解 UCS,只需阅读 ISO 10646:2012 标准即可。