我在 C 中有一个基于用户输入的 char * 字符串。我想从这个字符串的第一个位置开始截取一个子串,使得到的子串在固定宽度的终端上恰好占 n 列宽。
我过去从未处理过非 ASCII 字符,完全不知道如何处理这个问题,甚至不知道从何入手。一些初步搜索建议使用 libiconv,但似乎没有帮助。我也尝试过使用 wchar.h(宽字符支持),但我不确定这是不是正确的方法。
编辑:这是我在第一次尝试中尝试的内容:
/*
 * Copy into MBF the longest leading part of the multibyte string MBS that
 * fits in N terminal columns, and return the number of columns that part
 * occupies (never more than N).
 *
 * MBS is decoded with mbrtowc(), i.e. according to the LC_CTYPE category of
 * the current locale, so the caller must have selected a suitable locale
 * (e.g. via setlocale) for non-ASCII input to be decoded as characters.
 *
 * MBF must have room for strlen (MBS) + 1 bytes; it is always
 * NUL-terminated on return.
 *
 * Width rules:
 *   - a character's width is taken from wcwidth();
 *   - characters for which wcwidth() reports -1 (non-printable) are counted
 *     as one column;
 *   - a byte that cannot be decoded is copied as-is and counted as one
 *     column;
 *   - a character that would push the total past N ends the scan, so the
 *     result may be narrower than N when a wide character does not fit;
 *   - zero-width characters following the last fitting character are kept
 *     with it.
 */
static int
count_n_cols (const char *mbs, char *mbf, const int n)
{
  /* wcwidth() is a POSIX (XSI) function; <wchar.h> only declares it when
     _XOPEN_SOURCE is defined, so declare it here to be safe.  */
  extern int wcwidth (wchar_t);

  mbstate_t state;
  size_t remaining = strlen (mbs);
  size_t copied = 0;
  int cols = 0;

  memset (&state, 0, sizeof state);

  while (remaining > 0)
    {
      wchar_t wc;
      int width;
      size_t bytes = mbrtowc (&wc, mbs + copied, remaining, &state);

      if (bytes == 0)
        {
          /* Decoded a NUL character: end of string.  */
          break;
        }

      if (bytes == (size_t) -1 || bytes == (size_t) -2)
        {
          /* Invalid or incomplete sequence: the conversion state is now
             unspecified, so reset it and pass the offending byte through
             as a single one-column cell.  */
          memset (&state, 0, sizeof state);
          bytes = 1;
          width = 1;
        }
      else
        {
          width = wcwidth (wc);
          if (width < 0)
            width = 1;
        }

      /* Stop before a character that does not fit entirely.  */
      if (cols + width > n)
        break;

      copied += bytes;
      remaining -= bytes;
      cols += width;
    }

  memcpy (mbf, mbs, copied);
  mbf[copied] = '\0';
  return cols;
}
答案 0 :(得分:0)
如果我正确理解了您的问题,您需要统计 UTF-8 序列的数量,从而在不进行任何转换的情况下截取子字符串。按照 UTF-8 标准的规定,读取每个序列的第一个字节,就可以算出与每一"列"相对应的字节数。以下是一些示例代码,基于您的示例函数和 Wikipedia 上的 UTF-8 说明:
/*
 * Return the number of bytes occupied by the UTF-8 character starting at S.
 *
 * The lead byte selects a length of 1 to 4; for multi-byte sequences every
 * following byte must have the form 10xxxxxx.  If the lead byte cannot start
 * a sequence, or a continuation byte is missing or malformed, the result is
 * 1 so that the caller steps over the single offending byte.
 *
 * S must point into a NUL-terminated string: the terminator is never a
 * valid continuation byte, so the scan cannot run past it.  Only the
 * structure of the sequence is checked (overlong forms and surrogates are
 * not rejected).
 */
static size_t utf8_char_bytes (const char *s)
{
    const unsigned char lead = (unsigned char) s[0];
    size_t expected;

    if ((lead & 0x80) == 0x00)          /* 0xxxxxxx: ASCII */
        return 1;
    else if ((lead & 0xE0) == 0xC0)     /* 110xxxxx: two bytes */
        expected = 2;
    else if ((lead & 0xF0) == 0xE0)     /* 1110xxxx: three bytes */
        expected = 3;
    else if ((lead & 0xF8) == 0xF0)     /* 11110xxx: four bytes */
        expected = 4;
    else                                /* stray continuation or invalid lead */
        return 1;

    for (size_t i = 1; i < expected; i++)
    {
        if (((unsigned char) s[i] & 0xC0) != 0x80)
            return 1;
    }
    return expected;
}

/*
 * Copy into MBF the leading part of the UTF-8 string MBS that holds at most
 * N characters, and return the number of characters copied.
 *
 * Each UTF-8 sequence is counted as exactly one column; a byte that is not
 * part of a well-formed sequence is copied as-is and also counted as one
 * column.  Double-width and zero-width characters are therefore NOT
 * accounted for -- this is a pure byte-level scan with no locale or
 * wide-character conversion.
 *
 * MBF must have room for strlen (MBS) + 1 bytes; it is always
 * NUL-terminated on return.  Whole sequences are copied or skipped, so the
 * output is never cut in the middle of a well-formed character.
 */
static int count_n_cols (const char *mbs, char *mbf, const int n)
{
    const size_t length = strlen(mbs);
    size_t bytes = 0;
    int cols = 0;

    while (bytes < length && cols < n)
    {
        /* utf8_char_bytes never reports more bytes than remain before the
           terminator, so `bytes` cannot step past `length`. */
        bytes += utf8_char_bytes(mbs + bytes);
        cols++;
    }

    memcpy(mbf, mbs, bytes);
    mbf[bytes] = '\0';
    return cols;
}