Question

新编辑：基本上，我之前提供的例子并不恰当。在我的实际应用程序中，字符串当然不会总是“C:/Users/Familjen-Styren/Documents/V\u00E5gformer/20140104-0002/text.txt”。实际情况是：我会在 Java 中提供一个输入窗口，然后把其中的 Unicode 字符“转义”为通用字符名称，之后再在 C 中进行“反转义”（这样做是为了避免把多字节字符从 Java 传给 C）。下面这个例子中，我真正让用户输入一个字符串（文件名）：

#include <stdio.h>
#include <string.h>

int func(const char *fname);

/*
 * Reads one whitespace-delimited file name from stdin, echoes it, and
 * reports whether func() could open it.
 *
 * Returns the value func() returned, or 0 when no input could be read.
 */
int main(void)
{
   char src[100];

   /* The field width leaves room for the terminating NUL, so over-long input
      cannot overflow src. The array itself is passed, not its address:
      %s expects a char *, not a char (*)[100]. */
   if (scanf("%99s", src) != 1)
   {
      fprintf(stderr, "No file name could be read\n");
      return 0;
   }

   printf("%s\n", src);
   int exists = func(src);
   printf("Does the file exist? %d\n", exists);
   return exists;
}

/*
 * Checks whether the file named by fname can be opened for reading.
 *
 * fname: path handed to fopen() unchanged.
 * Returns 1 when fopen() succeeds (the stream is closed again before
 * returning), 0 when it fails.
 */
int func(const char *fname)
{
    FILE *stream = fopen(fname, "r");

    if (stream == NULL)
    {
        return 0;
    }

    fclose(stream);
    return 1;
}

现在它会把通用字符名称当作实际文件名的一部分。那么，我该如何对输入中包含的通用字符名称进行“反转义”呢？

第一次编辑：我是这样编译这个例子的：“gcc -std=c99 read.c”，其中 'read.c' 是我的源文件。我需要 -std=c99 参数，因为我使用前缀 '\u' 来书写通用字符名称。如果把它改成 '\x'，程序就能正常工作，而且可以去掉 -std=c99 参数。但在我的实际应用程序中，输入不会使用前缀 '\x'，而是使用前缀 '\u'。那么我该如何解决这个问题？

下面这段代码能得到所需的结果，但在我的实际应用程序中，我没办法使用 '\x'：

#include <stdio.h>
#include <string.h>

int func(const char *fname);

/*
 * Passes a hard-coded path, whose non-ASCII character is written as a
 * hexadecimal escape, to func() and prints the result.
 *
 * Returns the value func() returned.
 */
int main()
{
   const char *path = "C:/Users/Familjen-Styren/Documents/V\x00E5gformer/20140104-0002/text.txt";
   const int found = func(path);

   printf("Does the file exist? %d\n", found);
   return found;
}

/*
 * Reports whether fopen() can open fname in read mode.
 *
 * fname: path passed to fopen() as-is.
 * Returns 1 if the open succeeded (the stream is closed before returning),
 * otherwise 0.
 */
int func(const char *fname)
{
    FILE *stream = fopen(fname, "r");

    if (stream == NULL)
    {
        return 0;
    }

    fclose(stream);
    return 1;
}

原始问题：我在其他编程语言（如 JavaScript）中找到了一些如何执行此操作的示例，但找不到任何关于如何在 C 中执行此操作的示例。以下是一段会产生相同错误的示例代码：

#include <stdio.h>
#include <string.h>

int func(const char *fname);

/*
 * Copies a path containing a universal character name into a local buffer,
 * prints the copy, and reports whether func() could open the path.
 *
 * Returns the value func() returned.
 */
int main(void)
{
   const char *src = "C:/Users/Familjen-Styren/Documents/V\u00E5gformer/20140104-0002/text.txt";
   size_t len = strlen(src); /* 68 when the universal character name is encoded as two UTF-8 bytes. */
   char fname[len + 1];      /* +1: strlen() does not count the terminating NUL. */

   /* Bounded copy: never writes past fname, even if the sizes above change. */
   snprintf(fname, sizeof fname, "%s", src);
   int exists = func(src);
   printf("%s\n", fname);
   printf("Does the file exist? %d\n", exists); /* The author reports 0 here, i.e. the file was not found. */
   return exists;
}

/*
 * Checks whether the file named by fname can be opened for reading.
 *
 * fname: path handed to fopen() unchanged.
 * Returns 1 when fopen() succeeds (the stream is closed again before
 * returning), 0 when it fails.
 */
int func(const char *fname)
{
    FILE *stream = fopen(fname, "r");

    if (stream == NULL)
    {
        return 0;
    }

    fclose(stream);
    return 1;
}

如果我改为使用没有通用字符名称的相同字符串：

#include <stdio.h>
#include <string.h>

int func(const char *fname);

/*
 * Passes a hard-coded path, with the non-ASCII character typed literally in
 * the source, to func() and prints the result.
 *
 * Returns the value func() returned.
 */
int main()
{
   const char *path = "C:/Users/Familjen-Styren/Documents/Vågformer/20140104-0002/text.txt";
   const int found = func(path);

   printf("Does the file exist? %d\n", found); /* The author reports 1 here, i.e. the file was found. */
   return found;
}

/*
 * Reports whether fopen() can open fname in read mode.
 *
 * fname: path passed to fopen() as-is.
 * Returns 1 if the open succeeded (the stream is closed before returning),
 * otherwise 0.
 */
int func(const char *fname)
{
    FILE *stream = fopen(fname, "r");

    if (stream == NULL)
    {
        return 0;
    }

    fclose(stream);
    return 1;
}

会输出 'Does the file exist? 1'，这意味着文件确实存在。但问题是我需要能够处理通用字符名称。那么，我该如何对包含通用字符名称的字符串进行反转义？

提前致谢。

Answer 1

数组大小有误（漏算了 .txt 和结尾的 \0，而且编码后的非 ASCII 字符占用不止 1 个字节）。

// Character count of the path up to, but not including, ".txt", with the
// non-ASCII character counted as a single character:
// C:/Users/Familjen-Styren/Documents/Vågformer/20140104-0002/text
// 123456789012345678901234567890123456789012345678901234567890123
//          1         2         3         4         5         6
// int len = 63;   // too small: omits ".txt", the terminating '\0', and the
//                 // extra byte(s) the encoded non-ASCII character occupies

// Option 1: a fixed size comfortably larger than the full path
// C:/Users/Familjen-Styren/Documents/Vågformer/20140104-0002/text.txt
int len = 100;


char *src = "C:/Users/Familjen-Styren/Documents/V\u00E5gformer/20140104-0002/text.txt";
char fname[len];
// Option 2 (an alternative to the declaration of fname above, not an
// addition to it): size the array from the string itself, +1 for the
// terminating '\0'. Requires variable-length array support.
char fname[strlen(src)+1];

// Unbounded copy: safe only because fname has room for src plus its '\0'.
sprintf(fname, "%s", src);

Answer 2

我正在回答这个问题，希望能让它更加清晰。首先，我假设你熟悉这个：http://www.joelonsoftware.com/articles/Unicode.html。处理字符编码时需要背景知识。

现在我开始使用我在linux机器上输入的简单测试程序test.c

#include <stdio.h>
#include <string.h>
#include <wchar.h>
#define BUF_SZ 255
/*
 * Dumps the bytes of fname in hex, then creates a file called fname and
 * writes those same bytes into it.
 *
 * fname: NUL-terminated file name; used both as the path passed to fopen()
 *        and as the data written to the file.
 *
 * Prints "Wrote to file successfully" only when every byte was written and
 * the stream closed cleanly; failures are reported on stderr.
 */
void test_fwrite_universal(const char *fname)
{
    /* Measured once instead of on every loop iteration. */
    const size_t len = strlen(fname);

    printf("test_fwrite_universal on %s\n", fname);
    /* strlen() returns size_t, which must not be printed with %d. The cast to
       unsigned long with %lu also works on C runtimes whose printf has no
       %zu support. */
    printf("In memory we have %lu bytes: ", (unsigned long)len);
    for (size_t i = 0; i < len; ++i) {
        /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
        printf("%x ", (unsigned char)fname[i]);
    }
    printf("\n");

    FILE *file = fopen(fname, "w");
    if (file == NULL) {
        perror("fopen");
        return;
    }

    const size_t written = fwrite(fname, 1, len, file);
    /* fclose() flushes buffered data, so its result matters for a write. */
    if (fclose(file) != 0 || written != len) {
        fprintf(stderr, "Failed to write %s\n", fname);
        return;
    }
    printf("Wrote to file successfully\n");
}

/*
 * Runs test_fwrite_universal() on three file names: one spelling U+00E5 as
 * a universal character name, one containing that character literally, and
 * one using the universal character name for U+0436.
 *
 * Always returns 0.
 */
int main()
{
    const char *const names[] = {
        "file_\u00e5.txt",
        "file_å.txt",
        "file_\u0436.txt",
    };

    for (size_t idx = 0; idx < sizeof names / sizeof names[0]; ++idx) {
        test_fwrite_universal(names[idx]);
    }
    return 0;
}

文本文件编码为UTF-8。在我的linux机器上，我的语言环境是en_US.UTF-8 所以我编译并运行这样的程序：

gcc -std=c99 test.c -fexec-charset=UTF-8 -o test

./test

test_fwrite_universal on file_å.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74 
Wrote to file successfully
test_fwrite_universal on file_å.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74 
Wrote to file successfully
test_fwrite_universal on file_ж.txt
In memory we have 11 bytes: 66 69 6c 65 5f d0 b6 2e 74 78 74 
Wrote to file successfully

文本文件是 UTF-8，我的语言环境是 UTF-8，char 的执行字符集也是 UTF-8。在 main 中，我用不同的字符串调用了 3 次函数 test_fwrite_universal。该函数逐字节打印字符串，然后创建一个以该字符串命名的文件，并把该字符串写入文件。

我们可以看到 "file_\u00e5.txt" 和 "file_å.txt" 是相同的：66 69 6c 65 5f c3 a5 2e 74 78 74，而且（参见 http://www.fileformat.info/info/unicode/char/e5/index.htm）代码点 U+00E5 的 UTF-8 表示确实是 c3 a5。在最后一个例子中，我使用了 \u0436，这是俄语字母 ж（UTF-8：d0 b6）。

现在让我们在我的Windows机器上尝试相同的操作。在这里我使用mingw并执行相同的代码：

C:\test> gcc -std=c99 test.c -fexec-charset=UTF-8 -o test.exe

C:\test> test

test_fwrite_universal on file_├Ñ.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_├Ñ.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_╨╢.txt
In memory we have 11 bytes: 66 69 6c 65 5f d0 b6 2e 74 78 74
Wrote to file successfully

所以看起来出了严重的问题：printf 没有正确输出字符，磁盘上的文件名看起来也不对。有两点值得注意：就字节值而言，Linux 和 Windows 上的文件名是相同的；用 Notepad++ 打开时，文件内容也是正确的。

问题的原因在于 Windows 上的 C 标准库和语言环境。在 Linux 上，系统语言环境是 UTF-8；而在 Windows 上，我的默认语言环境是 CP-437。当我调用 printf、fopen 之类的函数时，它们假定输入是 CP-437 编码，于是 c3 a5 被当成了两个字符。

在我们查看正确的Windows解决方案之前，让我们尝试解释为什么您在file_å.txt vs file_\u00e5.txt中有不同的结果。我相信关键是文本文件的编码。如果我在CP-437中写相同的test.c：

C:\test> iconv -f UTF-8 -t cp437 test.c > test_lcl.c

C:\test> gcc -std=c99 test_lcl.c -fexec-charset=UTF-8 -o test_lcl.exe

C:\test> test_lcl

test_fwrite_universal on file_├Ñ.txt
In memory we have 11 bytes: 66 69 6c 65 5f c3 a5 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_å.txt
In memory we have 10 bytes: 66 69 6c 65 5f 86 2e 74 78 74
Wrote to file successfully
test_fwrite_universal on file_╨╢.txt
In memory we have 11 bytes: 66 69 6c 65 5f d0 b6 2e 74 78 74
Wrote to file successfully

我现在得到了 file_å 和 file_\u00e5 之间的区别。源文件中的字符 å 实际上被编码为 0x86。请注意，这次第二个字符串的长度是 10 个字节，而不是 11 个。如果我们查看该文件并让 Notepad++ 按 UTF-8 解读，就会看到一个有趣的结果；写入文件的实际数据也是同样的情况。

最后，如何让这该死的东西在 Windows 上工作。不幸的是，似乎无法让标准库使用 UTF-8 编码的字符串：在 Windows 上，您无法把 C 语言环境设置为 UTF-8。参见：What is the Windows equivalent for en_US.UTF-8 locale?

但是我们可以用宽字符解决这个问题：

#include <stdio.h> #include <string.h> #include <windows.h> #define BUF_SZ 255 void test_fopen_windows(const char *fname) { wchar_t buf[BUF_SZ] = {0}; int sz = MultiByteToWideChar(CP_UTF8, 0, fname, strlen(fname), (LPWSTR)buf, BUF_SZ-1); wprintf(L"converted %d characters\n", sz); wprintf(L"Converting to wide characters %s\n", buf); FILE* file =_wfopen(buf, L"w"); if (file) { fwrite((const void*)fname, 1, strlen(fname), file); fclose(file); wprintf(L"Wrote file %s successfully\n", buf); } } int main() { test_fopen_windows("file_\u00e5.txt"); return 0; }

编译使用：

gcc -std=gnu99 -fexec-charset=UTF-8 test_wide.c -o test_wide.exe

_wfopen 不属于 ANSI 标准，而 -std=c99 实际上会定义 __STRICT_ANSI__，因此应当改用 gnu99 才能使用该函数。

将通用字符名称转换为C中的相应字符

2 个答案: