Question

有没有办法在Linux下用C读取在Windows上用记事本保存为“UNICODE”格式的文本文件？在Linux中用nano编辑器打开时，文本显示如下：

��T^@e^@s^@t^@
^@

但在vi编辑器下正确读取：

Test

需要说明的是，文本内容只是普通的ANSI字符串（不含Unicode字符或外语内容）。我试过下面的代码，但没有结果：

#include <stdio.h>
#include <wchar.h>
#include <locale.h>

/*
 * Asker's original attempt: reads "unicode.txt" through the wide-character
 * stdio functions and prints every character it gets on its own line.
 */
int main() {
   /* A null locale argument only queries the current locale name; nothing is changed. */
   char *loc = setlocale(LC_ALL, 0);
   /* Re-applies the name that was just queried, so the locale in effect stays the same. */
   setlocale(LC_ALL, loc);
   /* NOTE(review): the fopen() result is used below without a NULL check. */
   FILE * f = fopen("unicode.txt", "r");
   wint_t c;

   /* fgetwc() decodes the file's bytes using the current locale's multibyte
    * encoding; the loop stops at the first WEOF (end of file or error). */
   while((c = fgetwc(f)) != WEOF) {
      wprintf(L"%lc\n", c);
   }
   return 0;
}

更新：

忘记提及文件格式为Little-endian UTF-16 Unicode text或UTF-16LE

Answer 1

包含<wchar.h>，设置一个UTF-8语言环境（setlocale(LC_ALL, "en_US.UTF-8")即可），以面向字节的模式打开文件或流（handle=fopen(filename, "rb")，fwide(handle,-1)，即非宽字符模式）。然后你可以使用

/*
 * Read one Unicode code point from a byte-oriented stream containing
 * UTF-16LE data.
 *
 * Returns the decoded code point, or WEOF when the stream is already at
 * end of input. The decoder is deliberately lenient:
 *   - a single trailing byte is returned as-is;
 *   - a code unit outside the high-surrogate range 0xD800..0xDBFF
 *     (including a lone low surrogate) is returned unchanged;
 *   - a high surrogate cut short by end of input is returned unchanged;
 *   - the second unit of a pair is not checked to be a low surrogate.
 */
wint_t getwc_utf16le(FILE *const in)
{
    int lo, hi, code;

    if ((lo = getc(in)) == EOF)
        return WEOF;

    if ((hi = getc(in)) == EOF)
        return lo; /* Or abort; input sequence ends prematurely */

    /* Little-endian: the low byte comes first. */
    code = lo + 256 * hi;
    if (code < 0xD800 || code > 0xDBFF)
        return code; /* Or abort; input sequence is not UTF16-LE */

    /* 'code' is a high surrogate; the next 16-bit unit completes the pair. */
    if ((lo = getc(in)) == EOF)
        return code; /* Or abort; input sequence ends prematurely */

    if ((hi = getc(in)) == EOF) {
        /* Push the odd byte back so the next call sees it again. */
        ungetc(lo, in);
        return code; /* Or abort; input sequence ends prematurely */
    }

    /* Note: if ((lo + 256*hi) < 0xDC00 || (lo + 256*hi) > 0xDFFF)
     *       the input sequence is not valid UTF16-LE. */
    return 0x10000 + ((code & 0x3FF) << 10) + ((lo + 256 * hi) & 0x3FF);
}

从这样的输入文件中读取代码点，假设它包含UTF16-LE数据。

上面的函数比严格必要的更宽松，但它确实能解析我拿来测试的所有UTF16-LE数据（包括有时会出问题的U+100000..U+10FFFF代码点），所以只要输入正确，这个函数应该能很好地处理。

由于Linux中的语言环境设置为UTF-8，并且Linux实现支持完整的Unicode集，因此代码点与上述函数生成的代码点匹配，您可以安全地使用宽字符函数（来自<wchar.h> ）处理输入。

文件中的第一个字符通常是BOM（“字节顺序标记”），即0xFEFF。如果它是文件中的第一个字符，则可以忽略它；出现在其他位置时，它表示零宽度不换行空格。根据我的经验，如果一个本应是文本的文件以这两个字节开头，这就是该文件为UTF16-LE的非常可靠的标志。（所以，您可以查看前两个字节，如果它们匹配，则假设它是UTF16-LE。）

请记住，宽字符的文件结束标志是WEOF，而不是EOF。

希望这有帮助。

编辑20150505：这是一个可以使用的辅助函数，用于读取输入（使用低级unistd.h接口），转换为UTF-8：read_utf8.h：

#ifndef   READ_UTF8_H
#define   READ_UTF8_H

#include <stddef.h>   /* size_t, so this header can be included on its own */

/* Read input from file descriptor fd,
 * convert it from 'charset' to UTF-8 (using "UTF8//TRANSLIT" iconv conversion),
 * and append the result to the specified buffer.
 *    (*dataptr)   points to a dynamically allocated buffer (may reallocate),
 *    (*sizeptr)   points to the size allocated for that buffer,
 *    (*usedptr)   points to the amount of data already in the buffer.
 * You may initialize the values to NULL,0,0, in which case they will
 * be dynamically allocated as needed.
 *
 * The buffer remains owned by the caller, who releases it with free().
 * The descriptor is only read from; it is never closed here.
 *
 * On success errno is set to 0. On failure errno holds an error code
 * (EINVAL, ENOTSUP, ENOMEM, EIO, EILSEQ, EDEADLK, or the error reported
 * by read()); the return value is intended to mirror errno.
*/
int read_utf8(char **dataptr, size_t *sizeptr, size_t *usedptr, const int fd, const char *const charset);

#endif /* READ_UTF8_H */

read_utf8.c：

#define  _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <iconv.h>
#include <string.h>
#include <errno.h>

#define   INPUT_CHUNK  16384
#define   OUTPUT_CHUNK  8192

/*
 * Read everything available from file descriptor fd, convert it from
 * 'charset' to UTF-8 with iconv ("UTF8//TRANSLIT"), and append the result
 * to the caller's dynamically allocated buffer.
 *
 *   dataptr   in/out: buffer pointer; *dataptr may be NULL, in which case
 *             the incoming *sizeptr and *usedptr are ignored and a fresh
 *             buffer is allocated. The buffer may be moved by realloc().
 *   sizeptr   in/out: number of bytes allocated for *dataptr.
 *   usedptr   in/out: number of bytes of data already in *dataptr.
 *   fd        descriptor to read from; it is not closed here.
 *   charset   iconv name of the input character set.
 *
 * Returns 0 on success and a nonzero errno code on failure; errno is set
 * to the same value in both cases:
 *   EINVAL    invalid arguments (NULL pointers, fd == -1, empty charset,
 *             or *usedptr > *sizeptr)
 *   ENOTSUP   iconv_open() rejected the conversion
 *   ENOMEM    out of memory
 *   EILSEQ    the input contains an invalid sequence
 *   EDEADLK   the conversion made no progress (e.g. the input ends in the
 *             middle of a multibyte sequence)
 *   EIO       read() returned an unexpected negative value
 *   other     error reported by read()
 *
 * On success at least 17 zero bytes follow the data, so the result can be
 * used as a NUL-terminated string. On failure *dataptr, *sizeptr and
 * *usedptr still describe a valid buffer holding whatever was converted
 * before the error.
 */
int read_utf8(char **dataptr, size_t *sizeptr, size_t *usedptr, const int fd, const char *const charset)
{
    char    *data;
    size_t   size;
    size_t   used;

    char    *input_data;
    size_t   input_size, input_head, input_tail;
    int      input_more;

    int      result;
    iconv_t  conversion;

    if (!dataptr || !sizeptr || !usedptr || fd == -1 || !charset || !*charset)
        return errno = EINVAL;

    if (*dataptr) {
        data = *dataptr;
        size = *sizeptr;
        used = *usedptr;
        if (used > size)
            return errno = EINVAL;
    } else {
        data = NULL;
        size = 0;
        used = 0;
    }

    conversion = iconv_open("UTF8//TRANSLIT", charset);
    if (conversion == (iconv_t)-1)
        return errno = ENOTSUP;

    input_size = INPUT_CHUNK;
    input_data = malloc(input_size);
    if (!input_data) {
        iconv_close(conversion);
        /* Report the failure through the return value too, like every
         * other error path; callers test the return value. */
        return errno = ENOMEM;
    }
    input_head = 0;
    input_tail = 0;
    input_more = 1;

    while (1) {

        /* Move any unconverted input to the start of the input buffer.
         * After this step input_head is always zero. */
        if (input_tail > input_head) {
            if (input_head > 0) {
                memmove(input_data, input_data + input_head, input_tail - input_head);
                input_tail -= input_head;
                input_head  = 0;
            }
        } else {
            input_head = 0;
            input_tail = 0;
        }

        /* Top up the input buffer, retrying reads interrupted by signals. */
        if (input_more && input_tail < input_size) {
            ssize_t n;

            do {
                n = read(fd, input_data + input_tail, input_size - input_tail);
            } while (n == (ssize_t)-1 && errno == EINTR);

            if (n > (ssize_t)0) {
                input_tail += (size_t)n;
            } else if (n == (ssize_t)0) {
                input_more = 0;             /* end of input */
            } else {
                /* -1 carries the real error in errno; any other negative
                 * value should never happen and is reported as EIO. */
                result = (n == (ssize_t)-1) ? errno : EIO;
                goto fail;
            }
        }

        /* Nothing buffered and nothing read: all input has been converted. */
        if (input_head == 0 && input_tail == 0)
            break;

        /* Keep at least OUTPUT_CHUNK bytes of room for each conversion. */
        if (used + OUTPUT_CHUNK > size) {
            const size_t  new_size = (used / (size_t)OUTPUT_CHUNK + (size_t)2) * (size_t)OUTPUT_CHUNK;
            char         *new_data = realloc(data, new_size);
            if (!new_data) {
                /* The old buffer is untouched and still owned by the caller. */
                result = ENOMEM;
                goto fail;
            }
            data = new_data;
            size = new_size;
            *dataptr = data;
            *sizeptr = size;
        }

        {
            char   *source_ptr = input_data + input_head;
            size_t  source_len = input_tail - input_head;

            char   *target_ptr = data + used;
            size_t  target_len = size - used;

            size_t  n;
            int     conv_errno;

            n = iconv(conversion, &source_ptr, &source_len, &target_ptr, &target_len);
            conv_errno = (n == (size_t)-1) ? errno : 0;

            if (conv_errno == EILSEQ) {
                result = EILSEQ;
                goto fail;
            }

            if (source_ptr == input_data + input_head && target_ptr == data + used) {
                /* No progress. If iconv only saw the beginning of a multibyte
                 * sequence because read() returned a short count (pipes,
                 * sockets), fetch more input instead of giving up. */
                if (conv_errno == EINVAL && input_more && input_tail < input_size)
                    continue;
                result = EDEADLK;
                goto fail;
            }

            input_head = (size_t)(source_ptr - input_data);
            used = (size_t)(target_ptr - data);

            *usedptr = used;
        }
    }

    free(input_data);
    iconv_close(conversion);

    /* Make sure more than 16 spare bytes follow the data. */
    if (used + 16 >= size) {
        const size_t  new_size = (used | 15) + 17;
        char         *new_data = realloc(data, new_size);
        if (!new_data)
            return errno = ENOMEM;
        data = new_data;
        size = new_size;
        *dataptr = data;
        *sizeptr = size;
    }

    /* Zero the whole unused tail. Never clear a fixed 32 bytes here: the
     * tail may be as short as 17 bytes, and that would overrun the buffer. */
    memset(data + used, 0, size - used);
    *usedptr = used;

    return errno = 0;

fail:
    free(input_data);
    iconv_close(conversion);
    return errno = result;
}

以及一个示例程序example.c，关于如何使用它：

#define  _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include "read_utf8.h"

/*
 * Example driver: reads the file named by argv[1], converts it from the
 * character set named by argv[2] to UTF-8 with read_utf8(), reports the
 * byte counts on standard error and writes the converted text to standard
 * output. Returns EXIT_SUCCESS on success and EXIT_FAILURE on a usage
 * error or any failure.
 */
int main(int argc, char *argv[])
{
    char   *file_buffer = NULL;
    size_t  file_allocd = 0;
    size_t  file_length = 0;
    int     fd;

    /* argv[1] is only inspected when argc == 3, so it is never NULL here. */
    if (argc != 3 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
        fprintf(stderr, "       %s FILENAME CHARSET\n", argv[0]);
        fprintf(stderr, "       %s FILENAME CHARSET//IGNORE\n", argv[0]);
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    /* Retry the open if a signal interrupts it. */
    do {
        fd = open(argv[1], O_RDONLY | O_NOCTTY);
    } while (fd == -1 && errno == EINTR);
    if (fd == -1) {
        fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
        return EXIT_FAILURE;
    }

    if (read_utf8(&file_buffer, &file_allocd, &file_length, fd, argv[2])) {
        /* Report first: the cleanup calls below may change errno. */
        if (errno == ENOTSUP)
            fprintf(stderr, "%s: Unsupported character set.\n", argv[2]);
        else
            fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
        free(file_buffer);
        close(fd);
        return EXIT_FAILURE;
    }

    /* Default error in case close() fails without setting errno. */
    errno = EIO;
    if (close(fd)) {
        fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
        free(file_buffer);
        return EXIT_FAILURE;
    }

    fprintf(stderr, "%s: read %zu bytes, allocated %zu.\n", argv[1], file_length, file_allocd);
    if (file_length > 0 && fwrite(file_buffer, file_length, 1, stdout) != 1) {
        fprintf(stderr, "Error writing to standard output.\n");
        free(file_buffer);
        return EXIT_FAILURE;
    }

    free(file_buffer);
    return EXIT_SUCCESS;
}

这使您可以读取系统支持的任何字符集的输入（使用iconv --list查看列表），并将内容自动转换为UTF-8；数据可以读入一个空的、动态分配的缓冲区，也可以追加到现有的动态分配的缓冲区。

它使用一个临时输入缓冲区（INPUT_CHUNK个字节）逐块读取文件，并以OUTPUT_CHUNK个字节的倍数重新分配输出缓冲区，保证每次转换至少有OUTPUT_CHUNK个字节可用。这些常量可能需要针对不同的用例做一些调整；它们绝不是最优值，甚至算不上建议值。较大的值会使代码更快，特别是INPUT_CHUNK，因为大多数文件系统在读取大块数据时表现更好（如果I/O性能很重要，当前建议的大小为2097152）——但是你应该让OUTPUT_CHUNK具有相似的大小，或者是其两倍，以减少所需的重新分配次数。（之后您可以使用realloc()将生成的缓冲区裁剪为used+1个字节，以避免内存浪费。）

Linux C读取文件UNICODE格式化文本（记事本Windows）

1 个答案: