从命令行参数获取unicode符号

时间:2019-04-26 14:46:26

标签: c unicode

我通过命令行参数向程序传递一个unicode符号。

$ ./program ●

程序应返回该符号的代码。

#include <stdio.h>  

/*
 * The question's original attempt, kept exactly as posted.
 * It does not decode the argument: it reinterprets the first
 * sizeof(wchar_t) bytes of argv[1] as a wchar_t and prints that value.
 * NOTE(review): argc is never checked, so argv[1] is assumed to be present.
 */
int main(int argc, char *argv[])
{
    wchar_t glyph;

    /* Raw reinterpretation of the argument's bytes; no multibyte decoding happens here. */
    glyph = *((wchar_t *) argv[1]);
    printf("%u\n", glyph);
}

●符号的代码为9679(十六进制25cf),但程序返回9410530。argv[1]参数的长度为3个字节,而不是4个(unicode符号为32位),它在内存中依次包含e2 97 8f \0这几个字节(按小端序读作0x008f97e2,即9410530)。如何获得正确的符号代码?

1 个答案:

答案 0 :(得分:0)

一种解决方案是使用mbstowcs()将UTF-8编码的字符从多字节字符串转换为宽字符。

#include <stdio.h>
#include <stdlib.h>
#include <locale.h>

/*
 * Print the numeric value of the first character of argv[1].
 *
 * The argument is decoded with mbstowcs() using the multibyte encoding of
 * the locale taken from the environment, so a UTF-8 argument needs a UTF-8
 * locale. The printed number is the wchar_t value of the character (the
 * Unicode code point on platforms where wchar_t holds ISO 10646 values).
 *
 * Exits with EXIT_FAILURE when the argument is missing or empty, when the
 * locale cannot be set, or when the argument is not a valid multibyte
 * sequence in that locale.
 */
int main(int argc, char *argv[])
{
    wchar_t u;

    /* argv[1] is a null pointer when no argument is given; refuse early. */
    if (argc < 2 || argv[1][0] == '\0') {
        fprintf(stderr, "usage: %s SYMBOL\n", argc > 0 ? argv[0] : "program");
        exit(EXIT_FAILURE);
    }

    /* Set locale according to the environment variables */
    if (setlocale(LC_ALL, "") == NULL) {
        /* setlocale() does not set errno, so perror() would print an
           unrelated message here. */
        fprintf(stderr, "setlocale: cannot set the locale from the environment\n");
        exit(EXIT_FAILURE);
    }

    /* Convert the multibyte character string in argv[1] to a
       wide character (only the first character is converted) */
    if (mbstowcs(&u, argv[1], 1) == (size_t) -1) {
        perror("mbstowcs");
        exit(EXIT_FAILURE);
    }

    /* The underlying type of wchar_t is implementation-defined, so convert
       explicitly to the type the format specifier expects. */
    printf("%lu\n", (unsigned long) u);

    return EXIT_SUCCESS;
}

另一种解决方案是手动解码UTF-8字符。以下代码取自st(suckless终端模拟器)。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

/* Code point stored when a sequence cannot be decoded (U+FFFD). */
#define UTF_INVALID   0xFFFD
/* Longest UTF-8 sequence, in bytes, that the decoder accepts. */
#define UTF_SIZ       4

typedef unsigned char uchar;
typedef uint_least32_t Rune;

/* Number of elements of a true array (not valid for pointers). */
#define LEN(a)          (sizeof(a) / sizeof((a)[0]))
/* Non-zero when a <= x <= b; note that x is evaluated twice. */
#define BETWEEN(x, a, b)    ((a) <= (x) && (x) <= (b))

/*
 * Byte classification tables, indexed by byte class:
 *   index 0      - continuation byte (10xxxxxx)
 *   index 1..4   - lead byte of a sequence of that many bytes
 * A byte b belongs to class k when (b & utfmask[k]) == utfbyte[k]; the
 * payload bits of the byte are b & ~utfmask[k].
 * The tables are read-only lookup data, hence const.
 */
static const uchar utfbyte[UTF_SIZ + 1] = {0x80,    0, 0xC0, 0xE0, 0xF0};
static const uchar utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
/*
 * Smallest and largest code point accepted for a sequence of the given
 * length (index 1..4); index 0 spans the whole range.
 */
static const Rune utfmin[UTF_SIZ + 1] = {       0,    0,  0x80,  0x800,  0x10000};
static const Rune utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};

Rune
utf8decodebyte(char c, size_t *i)
{
    for (*i = 0; *i < LEN(utfmask); ++(*i))
        if (((uchar)c & utfmask[*i]) == utfbyte[*i])
            return (uchar)c & ~utfmask[*i];

    return 0;
}

/*
 * Check a decoded code point against the sequence length it came from.
 *
 * *u is replaced by UTF_INVALID when it lies outside
 * [utfmin[i], utfmax[i]] or inside the surrogate range 0xD800..0xDFFF.
 *
 * Returns the smallest index k >= 1 with *u <= utfmax[k], computed from the
 * (possibly replaced) value of *u.
 */
size_t
utf8validate(Rune *u, size_t i)
{
    const Rune cp = *u;
    const int out_of_range = cp < utfmin[i] || cp > utfmax[i];
    const int surrogate = cp >= 0xD800 && cp <= 0xDFFF;
    size_t need = 1;

    if (out_of_range || surrogate)
        *u = UTF_INVALID;

    while (*u > utfmax[need])
        need++;

    return need;
}

/*
 * Decode the UTF-8 sequence at the start of c, reading at most clen bytes.
 *
 * *u receives the decoded code point, or UTF_INVALID when the sequence is
 * malformed, incomplete, or rejected by utf8validate().
 *
 * Returns:
 *   0    when clen is 0 or the buffer ends before the sequence is complete,
 *   1    when the first byte is not a valid lead byte,
 *   k    (the index of the offending byte) when byte k is not a
 *        continuation byte,
 *   len  the sequence length announced by the lead byte, otherwise.
 */
size_t
utf8decode(const char *c, Rune *u, size_t clen)
{
    size_t seqlen, kind, pos;
    Rune acc;

    *u = UTF_INVALID;
    if (clen == 0)
        return 0;

    acc = utf8decodebyte(c[0], &seqlen);
    if (seqlen < 1 || seqlen > UTF_SIZ)
        return 1;

    /* Fold in 6 payload bits per continuation byte. */
    pos = 1;
    while (pos < clen && pos < seqlen) {
        acc = (acc << 6) | utf8decodebyte(c[pos], &kind);
        if (kind != 0)
            return pos;
        pos++;
    }

    /* Ran out of input before the announced length was reached. */
    if (pos < seqlen)
        return 0;

    *u = acc;
    utf8validate(u, seqlen);

    return seqlen;
}


/*
 * Print the code point of the first UTF-8 encoded character of argv[1].
 *
 * Decoding is done by utf8decode(); when the argument does not start with
 * a well-formed sequence, the value left in u by utf8decode() (UTF_INVALID)
 * is printed.
 *
 * Exits with EXIT_FAILURE when the argument is missing or empty.
 */
int main(int argc, char *argv[])
{
    Rune u;

    /* argv[1] is a null pointer when no argument is given; refuse early. */
    if (argc < 2 || argv[1][0] == '\0') {
        fprintf(stderr, "usage: %s SYMBOL\n", argc > 0 ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    /* Pass the real length of the argument instead of a fixed UTF_SIZ, so
       the decoder is never told that bytes past the string are available. */
    utf8decode(argv[1], &u, strlen(argv[1]));

    /* Rune is uint_least32_t, which need not be unsigned int; convert
       explicitly to the type the format specifier expects. */
    printf("%lu\n", (unsigned long) u);

    return EXIT_SUCCESS;
}