Question

我正在尝试在给定以下八进制序列\303\255和\346\234\254的情况下在utf8中输出正确的字符，但是我没有得到正确的输出。

#include <stdio.h>
#include <stdlib.h>

/*
 * Encode one Unicode code point as UTF-8.
 *
 * buf: destination with room for at least 4 bytes. No NUL terminator is
 *      written.
 * ch:  the code point to encode. The parameter is wide enough to hold any
 *      code point; with an `unsigned char` parameter the value was capped at
 *      0xFF, so the 3- and 4-byte branches could never run. When passing a
 *      raw byte held in a plain `char`, cast it to `unsigned char` first so
 *      that a negative value is not sign-extended into an out-of-range
 *      code point.
 *
 * Returns the number of bytes written (1-4), or 0 when ch is not a valid
 * Unicode scalar value (a UTF-16 surrogate, or greater than 0x10FFFF); in
 * that case nothing is written.
 */
int encode(char *buf, unsigned long ch){
    if(ch < 0x80) {
        /* 0xxxxxxx */
        *buf++ = (char)ch;
        return 1;
    }
    if(ch < 0x800) {
        /* 110xxxxx 10xxxxxx */
        *buf++ = (char)((ch >> 6) | 0xC0);
        *buf++ = (char)((ch & 0x3F) | 0x80);
        return 2;
    }
    if(ch >= 0xD800 && ch <= 0xDFFF) {
        /* Surrogate halves are not characters and must not be encoded. */
        return 0;
    }
    if(ch < 0x10000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        *buf++ = (char)((ch >> 12) | 0xE0);
        *buf++ = (char)(((ch >> 6) & 0x3F) | 0x80);
        *buf++ = (char)((ch & 0x3F) | 0x80);
        return 3;
    }
    if(ch < 0x110000) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        *buf++ = (char)((ch >> 18) | 0xF0);
        *buf++ = (char)(((ch >> 12) & 0x3F) | 0x80);
        *buf++ = (char)(((ch >> 6) & 0x3F) | 0x80);
        *buf++ = (char)((ch & 0x3F) | 0x80);
        return 4;
    }
    /* Beyond the Unicode code space. */
    return 0;
}

/*
 * Print str to stdout followed by a newline, passing every byte of str
 * through encode() as a code point in the range 0-255.
 *
 * Each character is encoded into a small stack buffer and written out
 * immediately, so input of any length is handled without a heap allocation
 * and without the risk of overflowing a fixed-size buffer.
 */
void output (char *str) {
    char utf8[4];   /* encode() writes at most 4 bytes per character */
    while(*str) {
        /* Cast so a byte >= 0x80 is not seen as a negative value where
         * plain char is signed. */
        int n = encode(utf8, (unsigned char)*str++);
        fwrite(utf8, 1, (size_t)n, stdout);
    }
    putchar('\n');
}

/*
 * Demo driver: feeds two octal-escaped byte strings to output().
 * "\303\255" is the bytes C3 AD and "\346\234\254" is the bytes E6 9C AC.
 */
int main() {
    char *samples[] = { "\303\255", "\346\234\254" };
    int i;

    for (i = 0; i < 2; i++) {
        output(samples[i]);
    }

    return 0;
}

输出：Ã & æ¬，而不是预期的 í & 本

Answer 1

问题是您使用的代码序列已经是UTF-8

[   
    {
        "id": 2063,
        "text": "test",
        "position": {
            "lat": 43.357048,
            "lon": 27.9815636
        }
    },
    {
        "id": 2563,
        "text": "test2",
        "position": {
            "lat": 43.3570175,
            "lon": 27.9816666
        }
    },
    {
        "id": 2538,
        "text": "test3",
        "position": {
            "lat": 43.3092232,
            "lon": 27.97827
        }
    }
]

因此，您的编码功能正在尝试编码已编码的UTF-8，该UTF-8不起作用。

当我在支持UTF-8的终端上打印这些序列时，我会看到您期望看到的内容：

/* Both of these are already UTF-8 chars: the bytes C3 AD encode U+00ED
 * and the bytes E6 9C AC encode U+672C, so they need no further encoding. */
char *str1 = "\303\255";
char *str2 = "\346\234\254";

所以，如果你遇到新问题，也许你需要重新考虑你想要完成的事情并发布一个新问题。

Answer 2

很遗憾，您无法将char值（无论signed还是unsigned）与0x100及以上的值进行有意义的比较，因为char根本放不下这样的值。如果您想把单字节（iso-8859-1）值转换为utf-8，那么您还漏掉了一些东西。iso-8859-1字符与对应的UTF字符具有相同的代码值，因此转换相当简单，如下所示。

首先，所有iso-8859-1字符的代码值都与对应的UTF字符相同，因此第一步转换是恒等映射：我们把iso-8859-1中的每个值转换为UTF中的相同值（注意，当我说UTF时，指的是该字符未经任何编码的UTF代码；而当我说UTF-8时，指的是把UTF代码编码为一系列8位字节的编码方式）。

范围0x80...0xff中的UTF值必须用两个字节编码：第一个字节的模式为110000xx，其中xx是输入代码的两个最高有效位（位7和位6）；第二个字节的模式为10xxxxxx，其中xxxxxx是输入代码的六个最低有效位（位5到位0）。对于0x00...0x7f范围内的UTF值，只需用与UTF代码相同的单个字节对其编码。

以下功能确实可以解决这个问题：

/*
 * Convert one ISO-8859-1 byte to UTF-8.
 *
 * buf: destination; up to two encoded bytes plus a terminating NUL are
 *      written, so it needs room for 3 bytes.
 * iso: the ISO-8859-1 byte to convert.
 *
 * Returns the number of encoded bytes written, not counting the NUL:
 * 1 for values below 0x80, 2 otherwise.
 */
size_t iso2utf(unsigned char *buf, unsigned char iso)
{
    unsigned char *out = buf;

    if ((iso & 0x80) == 0) {
        /* 0xxxxxxx: plain ASCII is copied through unchanged. */
        *out++ = iso;
    } else {
        /* 110000xx carries the top two bits, 10xxxxxx the low six. */
        *out++ = 0xc0 | (iso >> 6);
        *out++ = 0x80 | (iso & 0x3f);
    }
    *out = '\0';

    return (size_t)(out - buf);
} /* iso2utf */

如果你想要一个完整的UTF到UTF-8编码器，可以试试下面这个（我用了不同的方法，因为每个UTF字符最多可以占用7个字节——实际上用不到那么多，因为目前实际使用的只有24位或25位的代码）：

#include <string.h>
#include <stdlib.h>

typedef unsigned int UTF;   /* you can use wchar_t if you prefer */
typedef unsigned char BYTE;

/*
 * Encode a zero-terminated array of UTF code values as a NUL-terminated
 * UTF-8 byte string.
 *
 * out: destination; up to 7 bytes are written per input code, plus the
 *      terminating NUL.
 * in:  input codes, terminated by a 0 element.
 *
 * Returns the number of bytes written, not counting the terminating NUL.
 */
size_t utf_utf8(BYTE *out, UTF *in)
{
    /* limits[k]: smallest value that no longer fits in the lead byte once k
     * continuation bytes have been produced.
     * masks[k]: lead-byte marker bits for a sequence with k continuation
     * bytes. */
    static UTF limits[] = { 0x80, 0x20, 0x10, 0x08, 0x4, 0x2, 0x01 };
    static UTF masks[]  = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe };
    size_t total = 0;

    while (*in) {
        BYTE scratch[7];              /* one encoded sequence, filled from the end */
        size_t pos = sizeof scratch;  /* index of the first used cell */
        size_t trailing = 0;          /* continuation bytes produced so far */
        UTF code = *in++;
        size_t len;

        /* Peel off six bits at a time, least significant first, until the
         * remainder fits in the lead byte. */
        while (code >= limits[trailing]) {
            scratch[--pos] = 0x80 | (code & 0x3f);
            code >>= 6;
            trailing++;
        }
        scratch[--pos] = masks[trailing] | code;

        len = trailing + 1;
        memcpy(out, scratch + pos, len);
        out += len;
        total += len;
    }
    *out = '\0'; /* terminate string */

    return total;
} /* utf_utf8 */

请注意，每个UTF代码最多七个字节这一点是硬编码的，因为这里假定UTF代码是32位整数。我不认为UTF代码会超过32位的限制，但如果真的超过了，UTF typedef以及数组aux、limits和masks的大小和内容都需要相应修改。utf-8编码所用的字节数上限为7或8，而且标准中没有以任何形式规定UTF代码空间用尽时该如何继续编码，所以最好不要在这方面做太多文章。

Answer 3

无用的函数参数：unsigned char ch

/// In the following bad code, `ch` is an unsigned char (at most 0xFF), so `if(ch < 0x800)` is always true and `if(ch < 0x10000)` is never reached
int encode(char *buf, unsigned char ch){
    if(ch < 0x80) {
      ...
      return 1;
    if(ch < 0x800) {
      ...
      return 2;
    if(ch < 0x10000) {

抱歉，GTG。

注意：该代码还有一个错误：没有检测高位和低位代理项（surrogate）。

如何从C语言中的八进制ISO-8859-1转储utf8

3 个答案: