我已经编写了用于验证非utf8字符串的代码。但是代码给出的输出表明该字符串是有效的。即使我们输入了无效字符,也会通过编辑器编码将其转换为有效utf8。需要帮助使用无效的utf8字符。需要包含无效utf8字符的字符串的帮助。 我提供了不同的输入,这些输入是无效的utf8字符。但是代码给出的输出是有效的字符串。我正在Linux中构建代码
#include <iostream>
#include <string.h>
using namespace std;
bool bFixString (char * Utf8) ;
int main()
{
char * sequence = "ðŸŒ";
int len = strlen(sequence) ;
cout << "String length = " << len <<endl;
cout << "Original String:" << sequence << endl;
bool s = bFixString(sequence);
cout << s <<endl;
return 0 ;
}
bool bFixString ( char * Utf8)
{
unsigned char * p = ( unsigned char *)Utf8;
unsigned int cnt,t;
int i=0;
bool result = false;
while (p[i] != NULL)
{
if (p[i] >= 0xC0)
{
// Is a multi-byte symbol. Count bytes needed for this symbol
for (cnt = 2; cnt < 7; cnt++)
if (!(p[i] & (0x80 >> cnt)))
{
break;
}
// now we need this number of corret continue-bytes.
for (t = 1; t < cnt; t++)
{
if (((p[i+t]) & 0xC0u) != 0x80)
{
// bad follow-up sequence
p[i] = '?'; // overwrite first byte (!) with a question mark. Follow-up bytes will be overwritten later in the loop.
cnt = 1;
result = true;
break;
}
}
// skip to next.
i+= cnt;
continue;
}
else if (p[i] >= 0x80)
{
// unexpected second byte
p[i] = '?';
result = true;
}
// Otherwise, its a normal character.
i++;
}
cout<<"changed string :"<<p<<endl;
return result;
}
预期结果是返回值应为1,无效字符应替换为“?”但返回0。