我有一个包含波斯语文本的文本文件:
یک
(U+06CC、U+06A9)。
我希望将此文件读入字符串,然后将其存储在输出文件中。 我正在使用此代码:
// Call spawn() with 'starter.sh' and an empty argument array and keep the returned handle.
// NOTE(review): `spawn` is not defined in this snippet - presumably Node's child_process.spawn; confirm.
// NOTE(review): if this runs under Node.js, the name `process` shadows the global `process` object in this scope.
const process = spawn('starter.sh', []);
// Log every chunk delivered by the handle's stdout 'data' event.
process.stdout.on('data', (data) => {
console.log(`stdout: ${data}`);
// prints 'Starting jarx' like expected
});
// Log every chunk delivered by the handle's stderr 'data' event.
process.stderr.on('data', (data) => {
console.log(`stderr: ${data}`);
});
// Log the exit code passed to the 'close' event handler.
process.on('close', (code) => {
console.log(`child process exited with code ${code}`);
// Not called unless all the spawned java processes are dead :(
});
它不起作用。我的期望:تست
(U+062A、U+0633、U+062A)。
答案 0 :(得分:0)
试试这个:
#include <locale.h>
#include <stdio.h>
#include <wchar.h>
/*
 * Read input.txt line by line as wide characters and echo each line to
 * standard output.
 *
 * The LC_CTYPE locale selects how the bytes of the file are decoded into
 * wchar_t and how they are encoded again on output, so it must name a
 * locale whose encoding matches the file.
 *
 * Returns 0 on success, 1 if input.txt cannot be opened.
 */
int main(void)
{
    FILE *input;
    wchar_t buf[1000];

    setlocale(LC_CTYPE, "it_IT.UTF-8"); // put your locale here

    input = fopen("input.txt", "r");
    if (input == NULL)
        return 1;

    /* fgetws takes the buffer capacity in wide characters, not bytes. */
    while (fgetws(buf, (int)(sizeof buf / sizeof buf[0]), input) != NULL)
    {
        /*
         * In the wide printf family "%ls" takes a wchar_t string;
         * plain "%s" expects a char string and would misread buf.
         */
        wprintf(L"%ls", buf);
    }

    fclose(input);
    return 0;
}
答案 1 :(得分:0)
这只是一个小错误;对于宽字符,你需要使用:
/* Write c to the stream out with the %lc (wide character) conversion, followed by one space. */
fprintf(out, "%lc ", c);
-
或
/* Same output through fwprintf: wide format string, %lc conversion of c, followed by one space. */
fwprintf(out, L"%lc ", c);
- - -
这样修改之后,在我这里可以正常运行。(另外,该文件中的字符是以空格分隔的。)
答案 2 :(得分:-1)
以下是我的Baby X项目中的utf-8例程。
您需要先将UTF-8转换为Unicode代码点,再转换为宽字符,然后才能传给需要宽字符的(可能是Windows的)函数。为方便起见,我还附上了反向转换。
/*
 * Correction constants indexed by the number of trailing bytes (0..5).
 *
 * The decoder adds up the raw bytes of a sequence, shifting left by 6
 * between bytes, and then subtracts the entry for that length. Each entry
 * is the sum of the marker bits that the raw bytes carried (the lead-byte
 * prefix and one 0x80 per continuation byte, each shifted into place),
 * e.g. entry 1 is (0xC0 << 6) + 0x80 = 0x3080.
 */
static const unsigned int offsetsFromUTF8[6] =
{
    0x00000000UL,   /* 0 trailing bytes: nothing to remove               */
    0x00003080UL,   /* 1 trailing byte : (0xC0 << 6)  + 0x80             */
    0x000E2080UL,   /* 2 trailing bytes: (0xE0 << 12) + (0x80 << 6) + .. */
    0x03C82080UL,   /* 3 trailing bytes: (0xF0 << 18) + ...              */
    0xFA082080UL,   /* 4 trailing bytes: (0xF8 << 24) + ...              */
    0x82082080UL    /* 5 trailing bytes: (0xFC << 30) + ..., mod 2^32    */
};
/*
 * Number of bytes that follow a given first byte, indexed by that byte's
 * value (0..255). Each row below covers 32 consecutive byte values:
 *   0x00-0xBF -> 0  (this includes 0x80-0xBF, so a continuation byte seen
 *                    in lead position is treated as a one-byte unit)
 *   0xC0-0xDF -> 1
 *   0xE0-0xEF -> 2
 *   0xF0-0xF7 -> 3
 *   0xF8-0xFB -> 4
 *   0xFC-0xFF -> 5
 */
static const unsigned char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
/*
 * Test whether a NUL-terminated string is well-formed UTF-8.
 *
 * A string is accepted only if every sequence
 *   - starts with a valid lead byte (ASCII, or a 2-, 3- or 4-byte lead),
 *   - is followed by the right number of 10xxxxxx continuation bytes,
 *   - uses the shortest possible form (no overlong encodings),
 *   - decodes to a code point no greater than U+10FFFF that is not a
 *     UTF-16 surrogate (U+D800..U+DFFF).
 *
 * @param str  NUL-terminated byte string; a null pointer is treated as
 *             invalid.
 * @return     1 if str is valid UTF-8 (the empty string is valid),
 *             0 otherwise.
 */
int bbx_isutf8z(const char *str)
{
    const unsigned char *p = (const unsigned char *)str;

    if (!str)
        return 0;

    while (*p)
    {
        unsigned long ch;   /* code point being assembled */
        unsigned long min;  /* smallest value that really needs nb bytes */
        int nb;             /* total length of this sequence in bytes */
        int i;

        if (*p < 0x80)
        {
            /* plain ASCII */
            p++;
            continue;
        }
        else if ((*p & 0xE0) == 0xC0)
        {
            nb = 2;
            ch = *p & 0x1F;
            min = 0x80;
        }
        else if ((*p & 0xF0) == 0xE0)
        {
            nb = 3;
            ch = *p & 0x0F;
            min = 0x800;
        }
        else if ((*p & 0xF8) == 0xF0)
        {
            nb = 4;
            ch = *p & 0x07;
            min = 0x10000;
        }
        else
        {
            /* stray continuation byte, or a 5/6-byte lead */
            return 0;
        }

        for (i = 1; i < nb; i++)
        {
            /*
             * The terminator also fails this test, so a truncated
             * sequence is rejected before anything past it is read.
             */
            if ((p[i] & 0xC0) != 0x80)
                return 0;
            ch = (ch << 6) | (unsigned long)(p[i] & 0x3F);
        }

        if (ch < min)
            return 0;               /* overlong encoding */
        if (ch > 0x10FFFF)
            return 0;               /* beyond the Unicode range */
        if (ch >= 0xD800 && ch <= 0xDFFF)
            return 0;               /* surrogate, not a scalar value */

        p += nb;
    }
    return 1;
}
/*
 * Length in bytes of the sequence announced by the first byte of utf8.
 *
 * Only the first byte is examined; the result is the table entry for that
 * byte (its trailing-byte count) plus one for the byte itself.
 *
 * @param utf8  pointer to at least one readable byte
 * @return      trailingBytesForUTF8[first byte] + 1
 */
int bbx_utf8_skip(const char *utf8)
{
    /* index as unsigned so bytes >= 0x80 do not become negative */
    const unsigned char lead = (unsigned char)utf8[0];

    return 1 + trailingBytesForUTF8[lead];
}
/*
 * Decode the code point that starts at utf8.
 *
 * The sequence length is taken from the first byte. The raw bytes are
 * summed with a 6-bit shift between them and the marker bits are then
 * removed in one step by subtracting offsetsFromUTF8[trailing bytes].
 * Continuation bytes are not checked for the 10xxxxxx pattern here.
 *
 * @param utf8  pointer into a NUL-terminated UTF-8 string
 * @return      the decoded value; 0 at the terminator; -1 if the first
 *              byte announces a 5- or 6-byte form (0xF8..0xFF) or if the
 *              string ends before the sequence is complete
 */
int bbx_utf8_getch(const char *utf8)
{
    const unsigned char *p = (const unsigned char *)utf8;
    unsigned int ch = 0;
    int nb = trailingBytesForUTF8[*p];
    int i;

    /* Only sequences of up to 4 bytes (3 trailing) can be decoded. */
    if (nb > 3)
        return -1;

    for (i = 0; i < nb; i++)
    {
        ch += p[i];
        ch <<= 6;
        /* Stop at the terminator instead of reading beyond the string. */
        if (p[i + 1] == 0)
            return -1;
    }
    ch += p[nb];

    ch -= offsetsFromUTF8[nb];
    return (int)ch;
}
/*
 * Encode one code point as UTF-8.
 *
 * @param out  destination buffer with room for up to 4 bytes; no NUL
 *             terminator is written
 * @param ch   code point to encode
 * @return     number of bytes written (1..4), or 0 with nothing written
 *             if ch is negative or not below 0x110000
 */
int bbx_utf8_putch(char *out, int ch)
{
    char *dest = out;

    /* A negative value is not a code point; treat it like any other
       unencodable value instead of emitting it as a raw byte. */
    if (ch < 0)
        return 0;

    if (ch < 0x80)
    {
        *dest++ = (char)ch;
    }
    else if (ch < 0x800)
    {
        *dest++ = (char)((ch >> 6) | 0xC0);
        *dest++ = (char)((ch & 0x3F) | 0x80);
    }
    else if (ch < 0x10000)
    {
        *dest++ = (char)((ch >> 12) | 0xE0);
        *dest++ = (char)(((ch >> 6) & 0x3F) | 0x80);
        *dest++ = (char)((ch & 0x3F) | 0x80);
    }
    else if (ch < 0x110000)
    {
        *dest++ = (char)((ch >> 18) | 0xF0);
        *dest++ = (char)(((ch >> 12) & 0x3F) | 0x80);
        *dest++ = (char)(((ch >> 6) & 0x3F) | 0x80);
        *dest++ = (char)((ch & 0x3F) | 0x80);
    }
    else
    {
        return 0;
    }

    return (int)(dest - out);
}
/*
 * Number of bytes used to encode ch in UTF-8.
 *
 * @param ch  code point
 * @return    1 for values below 0x80 (negative values included),
 *            2 below 0x800, 3 below 0x10000, 4 below 0x110000,
 *            and 0 for anything from 0x110000 upwards
 */
int bbx_utf8_charwidth(int ch)
{
    /* Test the thresholds from the top down. */
    if (ch >= 0x110000)
        return 0;
    if (ch >= 0x10000)
        return 4;
    if (ch >= 0x800)
        return 3;
    if (ch >= 0x80)
        return 2;
    return 1;
}
/*
 * Count the characters in a NUL-terminated UTF-8 string.
 *
 * The string is walked one sequence at a time, using bbx_utf8_skip() to
 * get each sequence's announced length. The bytes are not validated. A
 * final sequence that is cut short by the terminator counts as one
 * character.
 *
 * @param utf8  NUL-terminated string
 * @return      number of sequences stepped over
 */
int bbx_utf8_Nchars(const char *utf8)
{
    int answer = 0;

    while (*utf8)
    {
        int nb = bbx_utf8_skip(utf8);

        /*
         * Advance one byte at a time so that a truncated sequence at the
         * end of the string cannot carry the pointer past the terminator.
         */
        while (nb > 0 && *utf8)
        {
            utf8++;
            nb--;
        }
        answer++;
    }
    return answer;
}