我正在使用此代码http://www.w3.org/International/unescape.java的CPP版本将网址编码的字符串解码回原始格式。我希望能够检测到无效的uri / url编码序列,例如%s,%p,%ya,%ax等。但如果我通过字符串'天行者%s又回来了',它会回复'天行者回来'。它切断了无效序列,任何想法如何正确...?或者CPP中的URLDecoder功能在这样的问题上很酷......?
std::wstring URIDecodeAsWStr(std::string _tmpStrToEncode)
{
std::wstring _WStrDecPath = L"";
int l = _tmpStrToEncode.length();
int ch = -1 ;
int b, sumb = 0;
for (int i = 0, more = -1 ; i < l ; i++)
{
ch = _tmpStrToEncode.at(i);
switch (ch)
{
case '%':
++i;
if(i >= l)
{
b = ch;
break;
}
ch = _tmpStrToEncode.at(i);
int hb;
if(isdigit((unsigned char)ch))
{
hb = ch - '0';
}
else
{
hb = 10 + tolower(ch) - 'a';
}
hb = hb & 0xF;
++i;
if(i >= l)
{
b = ch;
break;
}
ch = _tmpStrToEncode.at(i);
int lb;
if(isdigit((unsigned char)ch))
{
lb = ch - '0';
}
else
{
lb = 10 + tolower(ch) - 'a';
}
lb = lb & 0xF;
b = (hb << 4) | lb ;
break ;
default:
b = ch ;
}
if ((b & 0xc0) == 0x80) // 10xxxxxx (continuation byte)
{
sumb = (sumb << 6) | (b & 0x3f) ; // Add 6 bits to sumb
if (--more == 0)
{
WCHAR temp [2];
temp [0]= sumb;
temp [1]= L'\0';
_WStrDecPath.append(temp); // Add char to sbuf
}
} else if ((b & 0x80) == 0x00){ // 0xxxxxxx (yields 7 bits)
WCHAR temp [2];
temp [0]= b;
temp [1]= L'\0';
_WStrDecPath.append(temp); // Store in sbuf
} else if ((b & 0xe0) == 0xc0) { // 110xxxxx (yields 5 bits)
sumb = b & 0x1f;
more = 1; // Expect 1 more byte
} else if ((b & 0xf0) == 0xe0) { // 1110xxxx (yields 4 bits)
sumb = b & 0x0f;
more = 2; // Expect 2 more bytes
} else if ((b & 0xf8) == 0xf0) { // 11110xxx (yields 3 bits)
sumb = b & 0x07;
more = 3; // Expect 3 more bytes
} else if ((b & 0xfc) == 0xf8) { // 111110xx (yields 2 bits)
sumb = b & 0x03;
more = 4; // Expect 4 more bytes
} else /*if ((b & 0xfe) == 0xfc)*/ { // 1111110x (yields 1 bit)
sumb = b & 0x01;
more = 5; // Expect 5 more bytes
} /* No need to test if the UTF-8 encoding is well-formed */
}
return _WStrDecPath;
}
还有一件事,输入应该是字符串,返回必须是wstring。