在C ++中检测无效的URL / URI编码序列

时间:2013-06-27 23:24:13

标签: c++ url-encoding

我正在使用此代码http://www.w3.org/International/unescape.java的CPP版本将网址编码的字符串解码回原始格式。我希望能够检测到无效的uri / url编码序列,例如%s,%p,%ya,%ax等。但如果我通过字符串'天行者%s又回来了',它会回复'天行者回来'。它切断了无效序列,任何想法如何正确...?或者CPP中的URLDecoder功能在这样的问题上很酷......?

    std::wstring URIDecodeAsWStr(std::string _tmpStrToEncode)
    {
        std::wstring _WStrDecPath = L"";
        int l  = _tmpStrToEncode.length();
        int ch = -1 ;
        int b, sumb = 0;
        for (int i = 0, more = -1 ; i < l ; i++)
        {
            ch = _tmpStrToEncode.at(i);
            switch (ch)
            {
                case '%':
                ++i;
                if(i >= l)
                {
                    b = ch;
                    break;
                }
                ch = _tmpStrToEncode.at(i);
                int hb;
                if(isdigit((unsigned char)ch))
                {
                    hb = ch - '0';
                }
                else
                {
                    hb = 10 + tolower(ch) - 'a';
                }
                hb = hb & 0xF;
                ++i;
                if(i >= l)
                {
                    b = ch;
                    break;
                }
                ch = _tmpStrToEncode.at(i);
                int lb;
                if(isdigit((unsigned char)ch))
                {
                    lb = ch - '0';
                }
                else
                {
                    lb = 10 + tolower(ch) - 'a';
                }
                lb = lb & 0xF;
                b = (hb << 4) | lb ;
                break ;

                default:
                b = ch ;
            }
            if ((b & 0xc0) == 0x80) // 10xxxxxx (continuation byte)
            {
                sumb = (sumb << 6) | (b & 0x3f) ;   // Add 6 bits to sumb
                if (--more == 0)
                {
                WCHAR temp [2];
                temp [0]= sumb;
                temp [1]= L'\0';
                _WStrDecPath.append(temp);              // Add char to sbuf
                }
            } else if ((b & 0x80) == 0x00){ // 0xxxxxxx (yields 7 bits)
                WCHAR temp [2];
                temp [0]= b;
                temp [1]= L'\0';
                _WStrDecPath.append(temp);  // Store in sbuf
            } else if ((b & 0xe0) == 0xc0) {    // 110xxxxx (yields 5 bits)
                sumb = b & 0x1f;
                more = 1;   // Expect 1 more byte
            } else if ((b & 0xf0) == 0xe0) {    // 1110xxxx (yields 4 bits)
                sumb = b & 0x0f;
                more = 2;   // Expect 2 more bytes
            } else if ((b & 0xf8) == 0xf0) {    // 11110xxx (yields 3 bits)
                sumb = b & 0x07;
                more = 3;   // Expect 3 more bytes
            } else if ((b & 0xfc) == 0xf8) {    // 111110xx (yields 2 bits)
                sumb = b & 0x03;
                more = 4;   // Expect 4 more bytes
            } else /*if ((b & 0xfe) == 0xfc)*/ {    // 1111110x (yields 1 bit)
                sumb = b & 0x01;
                more = 5;   // Expect 5 more bytes
            } /* No need to test if the UTF-8 encoding is well-formed */
        }
        return _WStrDecPath;
    }

还有一件事,输入应该是字符串,返回必须是wstring。

0 个答案:

没有答案