如何在C中将Unicode代码点打印为字符?

时间:2016-07-20 21:44:56

标签: c file-io unicode utf-8

我有一个uint32_t元素数组,每个元素都存储一个非拉丁Unicode字符的代码点。如何在控制台上打印它们,或将它们以UTF-8编码字符的形式存储到文件中?我知道它们可能无法在控制台上正确呈现,但如果我在兼容的编辑器中打开,它们应该能正常显示。

我尝试过使用wprintf(L"%lc", UINT32_T_VARIABLE),但无济于事。

2 个答案:

答案 0 :(得分:1)

您必须先选择正确的区域设置:

// setlocale() is declared in <locale.h>.
#include <locale.h>

// NOTE(review): the two calls below look like alternatives rather than a
// sequence (the second call would override the first) - presumably use
// whichever UTF-8 locale name exists on the target system.
setlocale(LC_ALL, "C.UTF-8");

setlocale(LC_ALL, "en_US.UTF-8");

然后使用printf或fprintf,并采用%lc格式:

// %lc converts a wide character to its multibyte form for the current locale.
// NOTE(review): %lc takes a wint_t argument; a cast may be needed where
// uint32_t and wint_t are different types - confirm for the target platform.
printf("%lc", UINT32_T_VARIABLE);

这仅适用于数值小到能够放入wchar_t的Unicode代码点。若需要更完整、可移植的解决方案,您可能需要自己实现Unicode到UTF-8的转换,这并不困难。

答案 1 :(得分:0)

最好在可用时使用现有代码。

自己编写Unicode代码点到UTF-8的转换很简单,但也很容易出错。本答案就经过2次修改才得以修正(感谢 @Jonathan Leffler 和 @chqrlie),因此建议对任何自行编写的解决方案进行严格测试。以下是经过轻度测试的代码,用于将代码点转换为字节数组。请注意,结果不是字符串(末尾没有空字符)。

// Encode one Unicode code point as UTF-8.
//
// utf8      - output buffer; between 0 and 4 bytes are written, so it must
//             have room for 4. No terminating null byte is appended.
// codepoint - value to encode.
//
// Returns the number of bytes stored in utf8[]. A return of 0 means the code
// point was rejected (a surrogate, or above 0x10FFFF) and utf8[] was left
// untouched.
unsigned Unicode_CodepointToUTF8(uint8_t *utf8, uint32_t codepoint) {
  // Marker bits of the leading byte, indexed by total sequence length.
  static const uint8_t lead_marker[5] = {0x00, 0x00, 0xC0, 0xE0, 0xF0};
  unsigned count;

  // Surrogate halves are not encodable on their own.
  if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
    return 0;
  }

  // Work out how many bytes the encoding needs.
  if (codepoint < 0x80) {
    count = 1;
  } else if (codepoint < 0x800) {
    count = 2;
  } else if (codepoint < 0x10000) {
    count = 3;
  } else if (codepoint < 0x110000) {
    count = 4;
  } else {
    return 0;  // beyond the Unicode range
  }

  // Emit continuation bytes from last to first, 6 payload bits each.
  for (unsigned i = count - 1; i > 0; i--) {
    utf8[i] = (uint8_t)(0x80 | (codepoint & 0x3F));
    codepoint >>= 6;
  }

  // The bits that remain belong to the leading byte.
  utf8[0] = (uint8_t)(lead_marker[count] | codepoint);
  return count;
}

// Sample usage: encode one code point and write the resulting bytes to a stream.
// foo(), Handle_BadCodePoint() and stream_opened_in_binary_mode are
// placeholders that are not defined in this snippet.
uint32_t cp = foo();
// 4 bytes is the longest sequence Unicode_CodepointToUTF8() produces.
uint8_t utf8[4];
unsigned len = Unicode_CodepointToUTF8(utf8, cp);
// A length of 0 means the code point was rejected by the encoder.
if (len == 0) Handle_BadCodePoint();
// utf8[] carries no terminating null, so exactly len bytes are written;
// y receives the number of bytes fwrite() actually wrote.
size_t y = fwrite(utf8, 1, len, stream_opened_in_binary_mode);