Question

我正在64位Solaris系统上使用GCC调试一些开源代码。这段代码把2字节的宽字符（wchar_t）转换为4字节的宽字符（wchar_t），因为Solaris和其他一些Unix一样，把wchar_t定义为4字节，而不是像Windows那样定义为2字节。

目前我通过把指针运算拆成两行修复了问题，但我不确定原始代码到底错在哪里。有什么线索吗？

原始代码

/*
 * Copy a sequence of 2-byte units from src into 4-byte units in dst.
 * A unit in the high-surrogate range followed by a unit in the
 * low-surrogate range is replaced by a single combined output value,
 * and both input units are consumed.
 *
 * src, src_size: input buffer and its length in elements.
 * dst, dst_size: output buffer and its capacity in elements.
 *
 * Returns the number of elements written to dst, -1 if the input ends
 * immediately after a high surrogate, or -2 if dst has no room left for
 * the next value.
 */
int StringCopy2to4bytes(const unsigned short* src, int src_size, 
                         unsigned int* dst, int dst_size)
{
 int cp_size = 0;

 const unsigned short *src_end = NULL;
 const unsigned int   *dst_end = NULL;

 unsigned int c1, c2;

 /* One-past-the-end markers for both buffers. */
 src_end = src + src_size;
 dst_end = dst + dst_size;

 while (src < src_end)
 {
     c1 = *src++;
     /* A high surrogate needs a following unit to be available. */
     if ((c1 >= UNI_SUR_HIGH_START) && (c1 <= UNI_SUR_HIGH_END))
     {
         if (src < src_end)
         {
             /* Peek only: src advances past c2 only when it is a low
              * surrogate; otherwise c1 is written unchanged and c2 is
              * handled by the next loop iteration. */
             c2 = *src;
             if ((c2 >= UNI_SUR_LOW_START) && (c2 <= UNI_SUR_LOW_END))
             {
                /* NOTE(review): the second term subtracts from c1, not c2;
                 * c2 is range-checked above but never enters the result.
                 * Standard UTF-16 decoding uses c2 here - confirm intent. */
                c1 = ((c1 - UNI_SUR_HIGH_START) << UNI_SHIFT) + 
                      (c1 - UNI_SUR_LOW_START )  + UNI_BASE;

                ++src;
             }
         } 
         else 
             return -1;
     } 

     /* Check the capacity before every write. */
     if (dst >= dst_end) return -2;

     *dst++ = c1;

     cp_size++;
 }

 return cp_size;
}

修复后的代码

/*
 * Copy a sequence of 2-byte units from src into 4-byte units in dst.
 * A unit in the high-surrogate range followed by a unit in the
 * low-surrogate range is replaced by a single combined output value,
 * and both input units are consumed.
 *
 * This variant differs from a single-statement `*src++` / `*dst++` form
 * only in that each dereference and its increment are written as two
 * separate statements (the lines marked FIX).
 *
 * src, src_size: input buffer and its length in elements.
 * dst, dst_size: output buffer and its capacity in elements.
 *
 * Returns the number of elements written to dst, -1 if the input ends
 * immediately after a high surrogate, or -2 if dst has no room left for
 * the next value.
 */
int StringCopy2to4bytes(const unsigned short* src, int src_size, 
                         unsigned int* dst, int dst_size)
{
 int cp_size = 0;

 const unsigned short *src_end = NULL;
 const unsigned int   *dst_end = NULL;

 unsigned int c1, c2;

 /* One-past-the-end markers for both buffers. */
 src_end = src + src_size;
 dst_end = dst + dst_size;

 while (src < src_end)
 {
     c1 = *src; //FIX
     ++src;

     /* A high surrogate needs a following unit to be available. */
     if ((c1 >= UNI_SUR_HIGH_START) && (c1 <= UNI_SUR_HIGH_END))
     {
         if (src < src_end)
         {
             /* Peek only: src advances past c2 only when it is a low
              * surrogate; otherwise c1 is written unchanged and c2 is
              * handled by the next loop iteration. */
             c2 = *src;
             if ((c2 >= UNI_SUR_LOW_START) && (c2 <= UNI_SUR_LOW_END))
             {
                /* NOTE(review): the second term subtracts from c1, not c2;
                 * c2 is range-checked above but never enters the result.
                 * Standard UTF-16 decoding uses c2 here - confirm intent. */
                c1 = ((c1 - UNI_SUR_HIGH_START) << UNI_SHIFT) + 
                      (c1 - UNI_SUR_LOW_START )  + UNI_BASE;

                ++src;
             }
         } 
         else 
             return -1;
     } 

     /* Check the capacity before every write. */
     if (dst >= dst_end) return -2;

     *dst = c1; //FIX
     ++dst;

     cp_size++;
 }

 return cp_size;
}

编辑：郑重说明，这段代码不是我写的，我只是在使用它，并且碰巧需要调试它。这一点其实没有太大区别，但源码相当庞大，所以我想用尽量小的改动（像用镊子一样）来修复它，而不是重构一切。无论如何，bug就是bug，我需要修复它，并把这个bug通过邮件报告给作者。

常数是：

/* unicode constants */
/* Left-shift applied to the high-surrogate offset when a surrogate pair
 * is combined in StringCopy2to4bytes. */
#define UNI_SHIFT             ((int) 10 )
/* Value added to the combined surrogate offsets (0x10000). */
#define UNI_BASE              ((unsigned int) 0x0010000UL)
/* Mask for the low 10 bits (0x3FF); not referenced by the code shown. */
#define UNI_MASK              ((unsigned int) 0x3FFUL)
/* U+FFFD; not referenced by the code shown. */
#define UNI_REPLACEMENT_CHAR  ((unsigned int) 0x0000FFFD)
/* Upper limits named after the BMP / UTF-16 / UTF-32 ranges; none of
 * them is referenced by the code shown. */
#define UNI_MAX_BMP           ((unsigned int) 0x0000FFFF)
#define UNI_MAX_UTF16         ((unsigned int) 0x0010FFFF)
#define UNI_MAX_UTF32         ((unsigned int) 0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32   ((unsigned int) 0x0010FFFF)
/* Inclusive range tested for the first unit of a surrogate pair. */
#define UNI_SUR_HIGH_START    ((unsigned int) 0xD800)
#define UNI_SUR_HIGH_END      ((unsigned int) 0xDBFF)
/* Inclusive range tested for the second unit of a surrogate pair. */
#define UNI_SUR_LOW_START     ((unsigned int) 0xDC00)
#define UNI_SUR_LOW_END       ((unsigned int) 0xDFFF)

Answer 1

这里写的代码仍然是错误的 - 当你组合c1和c2时，你需要使用c2！也就是说，在以下几行：

c1 = ((c1 - UNI_SUR_HIGH_START) << UNI_SHIFT) +
      (c1 - UNI_SUR_LOW_START ) + UNI_BASE;

第三次出现的c1实际上应该是c2。

此外，先把src_end指针初始化为NULL，然后再赋值为src + src_size，这显得有些多余。为什么不在声明时直接初始化为最终的值呢？

此外，如果保存了目标缓冲区的起始指针，cp_size就是多余的：它将与（dst - initial_dst）相同。

测试代码 - 使用c1到c2修复 - 使用第一个代码示例，在Solaris 10上使用GCC 4.3.3。显示了32位和64位编译的结果。 Unicode标准第3章表3.4中的数据（从技术上讲，Unicode 5.0而不是5.1.0，但我认为不重要）。

/*
 * Surrogate ranges and combination constants used by StringCopy2to4bytes.
 * NULL is deliberately not declared here: it is a macro owned by the
 * standard headers, and declaring it as an enumerator fails to compile as
 * soon as any standard header is included before this point.
 */
enum { UNI_SUR_HIGH_START = 0xD800, UNI_SUR_HIGH_END = 0xDBFF,
       UNI_SUR_LOW_START  = 0xDC00, UNI_SUR_LOW_END  = 0xDFFF,
       UNI_SHIFT = 10, UNI_BASE = 0x10000 };

/*
 * Convert UTF-16 code units in src to UTF-32 code points in dst.
 *
 * A high surrogate followed by a low surrogate is combined into one
 * supplementary-plane code point. A high surrogate followed by any other
 * unit is copied through unchanged, and the following unit is processed
 * on its own.
 *
 * src, src_size: input buffer and its length in elements.
 * dst, dst_size: output buffer and its capacity in elements.
 *
 * Returns the number of elements written to dst, -1 if the input ends
 * immediately after a high surrogate, or -2 if dst is too small.
 */
int StringCopy2to4bytes(const unsigned short *src, int src_size,
                        unsigned int *dst, int dst_size)
{
    /* One-past-the-end markers, initialised directly (no NULL needed). */
    const unsigned short *src_end = src + src_size;
    const unsigned int   *dst_end = dst + dst_size;
    int cp_size = 0;

    while (src < src_end)
    {
        unsigned int c1 = *src++;

        if ((c1 >= UNI_SUR_HIGH_START) && (c1 <= UNI_SUR_HIGH_END))
        {
            unsigned int c2;

            /* A high surrogate at the very end of the input is an error. */
            if (src >= src_end)
                return -1;

            /* Peek only: c2 is consumed just when it completes the pair. */
            c2 = *src;
            if ((c2 >= UNI_SUR_LOW_START) && (c2 <= UNI_SUR_LOW_END))
            {
                c1 = ((c1 - UNI_SUR_HIGH_START) << UNI_SHIFT) +
                      (c2 - UNI_SUR_LOW_START) + UNI_BASE;
                ++src;
            }
        }

        /* Check the capacity before every write. */
        if (dst >= dst_end)
            return -2;

        *dst++ = c1;
        cp_size++;
    }

    return cp_size;
}

#include <stdio.h>
#include <stdlib.h>

/*
 * Test driver: converts a fixed UTF-16 sample (including one surrogate
 * pair) with StringCopy2to4bytes and compares the result against the
 * expected UTF-32 values. Prints each mismatch and exits with
 * EXIT_FAILURE if anything differs, EXIT_SUCCESS otherwise.
 */
int main(void)
{
    unsigned short w2_chars[]  = { 0x004D, 0x0430, 0x4E8C, 0xD800, 0xDF02, 0x004D };
    unsigned int   w4_wanted[] = { 0x00004D, 0x000430, 0x004E8C, 0x010302, 0x00004D };
    /* Zero-initialised so the comparison below never reads indeterminate
     * values when the conversion stops early with an error code. */
    unsigned int   w4_actual[sizeof(w4_wanted) / sizeof(w4_wanted[0])] = { 0 };
    /* Lengths are derived from the arrays so they cannot drift apart. */
    int w2_len = (int)(sizeof(w2_chars) / sizeof(w2_chars[0]));
    int w4_len = (int)(sizeof(w4_wanted) / sizeof(w4_wanted[0]));
    int w4_actlen;
    int i;
    int failed = 0;

    w4_actlen = StringCopy2to4bytes(w2_chars, w2_len, w4_actual, w4_len);
    if (w4_actlen != w4_len)
    {
        failed = 1;
        printf("Length mismatch: got %d, wanted %d\n", w4_actlen, w4_len);
    }
    for (i = 0; i < w4_len; i++)
    {
        if (w4_actual[i] != w4_wanted[i])
        {
            printf("Mismatch: index %d: wanted 0x%06X, actual 0x%06X\n",
                   i, w4_wanted[i], w4_actual[i]);
            failed = 1;
        }
    }
    if (failed == 0)
        printf("No problem observed\n");
    return((failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
}


$ gcc -m32 -O utf.c -o utf32 && ./utf32
No problem observed
$ gcc -m64 -O utf.c -o utf64 && ./utf64
No problem observed
$

我想知道你的编译器是什么 - 或你的测试用例。

这是StringCopy2to4bytes（）函数的修订版本。它检测并报告原始没有的错误情况 - 即当代理对的第二个字不是有效的低代理代码点时，它返回状态-3。

/*
 * Convert UTF-16 code units in src to UTF-32 code points in dst.
 *
 * src, src_size: input buffer and its length in elements.
 * dst, dst_size: output buffer and its capacity in elements.
 *
 * Returns the number of elements written to dst, or a negative status:
 *   -1  the input ends immediately after a high surrogate;
 *   -2  dst has no room left for the next code point;
 *   -3  a high surrogate is followed by a unit that is not a low surrogate.
 *
 * The declaration of the second unit inside the nested block keeps the
 * function valid under both C89 and C99.
 */
int StringCopy2to4bytes(const unsigned short *src, int src_size, 
                        unsigned int *dst, int dst_size)
{
    const unsigned short *const in_limit  = src + src_size;
    const unsigned int   *const out_limit = dst + dst_size;
    unsigned int         *const out_begin = dst;

    /* dst advances once per completed iteration, i.e. once per write. */
    for (; src < in_limit; ++dst)
    {
        unsigned int cp = *src++;

        if (!(cp < UNI_SUR_HIGH_START || cp > UNI_SUR_HIGH_END))
        {
            unsigned int low;

            if (src >= in_limit)
            {
                return -1;
            }
            low = *src++;
            if (low < UNI_SUR_LOW_START || low > UNI_SUR_LOW_END)
            {
                return -3;  /* Invalid second code point in surrogate pair */
            }
            cp = UNI_BASE + ((cp - UNI_SUR_HIGH_START) << UNI_SHIFT)
                          + (low - UNI_SUR_LOW_START);
        }

        if (dst >= out_limit)
        {
            return -2;
        }
        *dst = cp;
    }

    return dst - out_begin;
}

同样的测试代码给出了同样干净的结果（没有发现问题）。c2的声明位置假定您使用的是C99，而不是C89。

Answer 2

这看起来像是我们可能需要加一些括号 ()。

看看差异

c1 = *src++;
c1 = (*src)++;
c1 = *(src++);

我真的很喜欢（），因为它们消除了程序员想要做的事情的一些含糊之处。

/约翰

Answer 3

应该没有区别：后缀 ++ 的优先级高于 *。

快速test.c没有显示任何差异：

#include <stdio.h>

/*
 * Demonstrates that *p++ and *(p++) are the same expression: each one
 * yields the element currently pointed to and then advances the pointer.
 */
int main(void)
{
    int values[] = { 0, 1, 2, 3, 4, 5 };
    int *cursor = values;

    printf("%d\n", *cursor);        /* element before any increment */
    printf("%d\n", *cursor++);      /* same element; pointer then advances */
    printf("%d\n", *cursor);        /* next element */
    printf("%d\n", *(cursor++));    /* explicit parentheses, same behaviour */
    printf("%d\n", *cursor);        /* element after the second increment */

    return 0;
}

给出：

0
0
1
1
2

是什么让你认为原来存在一个问题，并且是新代码把它修复了？

编辑：在如此简单的情形下碰到编译器bug的可能性极小。上述测试使用GCC 4.1.2进行。

编辑2：您的某些类型不匹配：c1是unsigned int，而*src是unsigned short。大小参数应该是size_t，而不是int。如果您在原始代码中修正这些类型，问题是否仍然存在？

这两个版本的代码（指针算术和unicode）有什么区别？

3 个答案: