在进行 URL 编码时,std::regex_replace 对字符 "+" 不起作用

时间:2018-05-16 06:46:12

标签: c++ c++11 boost boost-regex

以下是代码片段。regex_replace 对字符 "+" 不起作用;我本不应该需要对这个字符做特殊处理,它应该能正常工作。

/*All headerfiles are available.*/



std::string charToHex(unsigned char c, bool bUpperCase);
std::string urlEncode(const std::string& toEncode, bool bEncodeForwardSlash);
// Returns a copy of `url` in which the path component has been percent-encoded.
//
// The URL is split with a generic-URI regular expression; capture group 6 is the
// path (everything after the authority and before any '?' or '#'). The encoded
// path is then substituted back into the URL with std::regex_replace.
//
// NOTE: this is the listing that reproduces the reported problem. The raw path
// text is compiled as a std::regex pattern, so regex metacharacters inside the
// path are interpreted instead of being matched literally. For the sample path
// "/rbkt10/+" the pattern means "/rbkt10" followed by one or more '/', so only
// "/rbkt10/" is matched and replaced, and the original '+' is left behind.
//
// If the URL does not match the expression, `url` is returned unchanged.
std::string getEncodedUrl(const std::string& url){
std::string bktObjKey = "";

std::string urlEnc = url;

// Groups: 1-2 scheme, 3-5 authority (host, optional :port), 6 path, 7-8 query, 9-10 fragment.
boost::regex expression("^(([^:/?#]+):)?(//([^/?#:]*)(:\\d+)?)?([^?#]*)((\\?[^#]*))?(#(.*))?");

// `start` and `end` are declared but never used; the search below calls url.begin()/url.end() directly.
std::string::const_iterator start=url.begin(), end = url.end();
boost::match_results<std::string::const_iterator> what;
boost::match_flag_type flags = boost::match_default;
if (regex_search(url.begin(), url.end(), what, expression, flags)) {
  std::cout<<"Matched"<<std::endl;
  // Copy the matched path (group 6) into bktObjKey.
  bktObjKey.insert(bktObjKey.begin(), what[6].first, what[6].second);

  // The unescaped path becomes a regex pattern here; this is the source of the bug described above.
  std::regex fobj(bktObjKey);
  /*std::string fobj = bktObjKey;*/

  /*auto pos = url.find(bktObjKey);*/
  // From here on bktObjKey holds the percent-encoded path ('/' is kept as-is).
  bktObjKey = urlEncode(bktObjKey, false);
  std::cout<<"bktObjKey :"<<bktObjKey.c_str()<<" urlEnc: "<<urlEnc.c_str()<<std::endl;

  // Replaces whatever the path-as-pattern matches in `url` with the encoded path.
  urlEnc = std::regex_replace(url, fobj, bktObjKey);
  std::cout<<" urlEnc: "<<urlEnc.c_str()<<std::endl;
}
  return urlEnc;
}
// Percent-encodes `toEncode` and returns the result.
//
// ASCII letters, digits and the characters '_', '-', '~', '.' are copied through
// unchanged. '/' is copied through only when `bEncodeForwardSlash` is false.
// Every other character is written as '%' followed by the output of charToHex.
//
// The function also writes progress messages to std::cout: one line on entry,
// one line per input character, and one line with the final result.
std::string urlEncode(const std::string& toEncode, bool bEncodeForwardSlash)  {
  std::ostringstream out;

  std::cout<<"inside encode"<<std::endl;
  for(std::string::size_type i=0; i < toEncode.length(); ++i) {
    char ch = toEncode.at(i);
    if ((ch >= 'A' && ch <= 'Z') ||
        (ch >= 'a' && ch <= 'z') ||
        (ch >= '0' && ch <= '9') ||
        (ch == '_' || ch == '-' || ch == '~' || ch == '.') ||
        (ch == '/' && !bEncodeForwardSlash)) {
      out << ch;
      // Prints everything accumulated so far, not just the current character.
      std::cout<<out.str()<<" Is not coded to HEX"<<std::endl;
    }
    else {
      // `ch` is converted to unsigned char by charToHex's parameter type.
      out << "%" <<  charToHex(ch, true);
      // Prints everything accumulated so far, not just the current character.
      std::cout<<out.str()<<" Is coded to HEX"<<std::endl;
    }
  }
  std::cout<<"Return :"<<out.str()<<std::endl;
  return out.str();
}

// Formats the byte `c` as exactly two hexadecimal digits (zero-padded).
//
// `bUpperCase` is accepted but not consulted: the digits are always lower-case,
// e.g. '+' (0x2B) yields "2b" for either value of the flag.
std::string charToHex(unsigned char c, bool bUpperCase) {
  (void)bUpperCase;  // intentionally unused, see above
  static const char kLowerHexDigits[] = "0123456789abcdef";
  std::string twoDigits;
  twoDigits += kLowerHexDigits[c >> 4];    // high nibble
  twoDigits += kLowerHexDigits[c & 0x0F];  // low nibble
  return twoDigits;
}

// Runs getEncodedUrl on one sample URL whose path ends in '+' and prints the result.
int main() {
  const std::string sampleUrl{"http://10.130.0.36/rbkt10/+"};

  // Encode first, print afterwards: getEncodedUrl writes its own trace output to
  // std::cout, which must appear before the summary line below.
  const std::string encoded = getEncodedUrl(sampleUrl);
  std::cout << "Encoded URL1=:" << encoded << std::endl;

  return 0;
}

输出: Encoded URL1=:http://10.130.0.36/rbkt10/%2b+

因此输出变成了 "++",而它本应只有一个 "+"。我该怎样才能让它正确工作?

1 个答案:

答案 0 :(得分:4)

  1. 您把原始字符串当作正则表达式来解释了,而 + 在正则表达式中是特殊字符¹。

    您应该只使用std::string::replace,因为您不需要正则表达式替换功能:

    boost::smatch what;
    if (regex_search(url.cbegin(), url.cend(), what, expression)) {
        boost::ssub_match query = what[6];
        url.replace(query.first, query.second, urlEncode(query.str(), false));
    }
    
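  2. 像下面这样复杂、分散的代码:先声明 `std::string bktObjKey = "";`,之后再调用 `bktObjKey.insert(bktObjKey.begin(), what[6].first, what[6].second);`
    可以简化为: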

    std::string bktObjKey = what[6].str();
    
  3. 复杂的循环

    for (std::string::size_type i = 0; i < toEncode.length(); ++i) {
         char ch = toEncode.at(i);
    

    可能只是

    for (char ch : toEncode) {
    
  4. charToHex 每次调用都会新建一个 2 个字符的字符串,为此还要再创建一个字符串流,然后把结果复制到外层的字符串流中,等等。相反,只需直接写入您已有的那个字符串流,就能避免所有这些低效之处:

    // Writes the byte `c` to `os` as exactly two hexadecimal digits.
    //
    // `uppercase` selects "A"-"F" (true) or "a"-"f" (false) on every call.
    // The digits are emitted as plain characters, so the stream's formatting
    // state (base, case flag, fill character) is left untouched; using
    // std::hex / std::uppercase / std::setfill here would leave them set on the
    // caller's stream and make a later `uppercase == false` call print upper case.
    void writeHex(std::ostream& os, unsigned char c, bool uppercase) {
        const char* const digits = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
        os << digits[c >> 4] << digits[c & 0x0F];
    }
    
      

    请注意,这也顺便解决了您忘记使用 bUpperCase 参数的问题。
  5. 请查看<cctype>以获取有关字符分类的帮助。

  6. 使用原始字符串字面量(raw string literal)。不要这样写:

    boost::regex expression("^(([^:/?#]+):)?(//([^/?#:]*)(:\\d+)?)?([^?#]*)((\\?[^#]*))?(#(.*))?");
    

    代替:

    boost::regex expression(R"(^(([^:/?#]+):)?(//([^/?#:]*)(:\d+)?)?([^?#]*)((\?[^#]*))?(#(.*))?)");
    

    (这样就无需对 \d 和 \? 进行双重转义。)

  7. 删除所有冗余子组

    boost::regex expression(R"(^([^:/?#]+:)?(//[^/?#:]*(:\d+)?)?[^?#]*(\?[^#]*)?(#.*)?)");
    

    或使它们可维护和有用²:

    // One named capture group per generic-URI component.
    // The host group must be written `(?<host>...)`; with a backslash in front of
    // the '?' it would instead require the literal text "?<host>" and the
    // authority would never match.
    boost::regex uri_regex(
        R"(^((?<scheme>[^:/?#]+):)?)"
        R"((?<authority>//(?<host>[^/?#:]*)(:(?<port>\d+))?)?)"
        R"((?<path>[^?#]*))"
        R"((\?(?<query>([^#]*)))?)"
        R"((#(?<fragment>.*))?)");
    
  8. 既然您可以访问URI的逻辑组件,请应用它以更好地了解编码的时间和位置:

        std::string escaped = 
           what["scheme"].str() + 
           what["authority"].str() +
           urlEncode(what["path"].str(), false);
    
        if (query.matched) {
            escaped += '?';
            escaped.append(urlEncode(query, true));
        }
    
        if (fragment.matched) {
            escaped += '#';
            escaped.append(urlEncode(fragment, true));
        }
    
  9. 重载urlEncode,它会使用现有的ostream引用,而不是始终创建自己的引用:

    std::ostringstream out;
    out << what["scheme"] << what["authority"];
    urlEncode(out, what["path"], false);
    
    if (query.matched)
        urlEncode(out << '?', query, true);
    
    if (fragment.matched)
        urlEncode(out << '#', fragment, true);
    
  10. 审核后的代码

    **Live On Coliru**

    #include <cctype>
    #include <cstdint>
    #include <cstring>
    #include <iomanip>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <boost/regex.hpp>
    
    // Writes the byte `c` to `os` in percent-encoded form: '%' followed by
    // exactly two hexadecimal digits.
    //
    // `uppercase` selects "A"-"F" (true) or "a"-"f" (false) on every call.
    // The digits are emitted as plain characters, so the stream's formatting
    // state (base, case flag, fill character) is left untouched; using
    // std::hex / std::uppercase / std::setfill here would leave them set on the
    // caller's stream and make a later `uppercase == false` call print upper case.
    void writeHex(std::ostream& os, unsigned char c, bool uppercase) {
        const char* const digits = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";
        os << '%' << digits[c >> 4] << digits[c & 0x0F];
    }
    
    // Percent-encodes `toEncode` and writes the result to `os`.
    //
    // ASCII letters, digits and '_', '-', '~', '.' are written unchanged; '/' is
    // written unchanged only when `bEncodeForwardSlash` is false. Every other
    // byte, including NUL and bytes >= 0x80, is written via writeHex in upper case.
    //
    // The safe set is tested with explicit ASCII comparisons: std::strchr also
    // matches the string's terminating '\0' (so a NUL byte would pass as "safe"),
    // and std::isalnum depends on the current C locale.
    void urlEncode(std::ostream& os, const std::string &toEncode, bool bEncodeForwardSlash) {
        auto is_unreserved = [bEncodeForwardSlash](unsigned char ch) {
            return (ch >= 'A' && ch <= 'Z') ||
                   (ch >= 'a' && ch <= 'z') ||
                   (ch >= '0' && ch <= '9') ||
                   ch == '_' || ch == '-' || ch == '~' || ch == '.' ||
                   (ch == '/' && !bEncodeForwardSlash);
        };

        for (const char ch : toEncode) {
            const auto byte = static_cast<unsigned char>(ch);
            if (is_unreserved(byte))
                os << ch;
            else
                writeHex(os, byte, true);
        }
    }
    
    // Convenience overload: percent-encodes `toEncode` into a fresh string.
    // Delegates to the std::ostream overload, passing `bEncodeForwardSlash` through.
    std::string urlEncode(const std::string &toEncode, bool bEncodeForwardSlash) {
        std::ostringstream encoded;
        urlEncode(encoded, toEncode, bEncodeForwardSlash);
        return encoded.str();
    }
    
    std::string getEncodedUrl(std::string url) {
    
        boost::regex uri_regex(
            R"(^((?<scheme>[^:/?#]+):)?)"
            R"((?<authority>//(\?<host>[^/?#:]*)(:(?<port>\d+))?)?)"
            R"((?<path>[^?#]*))"
            R"((\?(?<query>([^#]*)))?)"
            R"((#(?<fragment>.*))?)");
    
        boost::match_results<std::string::iterator> what;
        //boost::smatch what;
        if (regex_search(url.begin(), url.end(), what, uri_regex)) {
            auto& full     = what[0];
            auto& query    = what["query"];
            auto& fragment = what["fragment"];
    
            std::ostringstream out;
            out << what["scheme"] << what["authority"];
            urlEncode(out, what["path"], false);
    
            if (query.matched)
                urlEncode(out << '?', query, true);
    
            if (fragment.matched)
                urlEncode(out << '#', fragment, true);
    
            url.replace(full.begin(), full.end(), out.str());
        }
        return url;
    }
    
    // Prints the encoded form of several sample inputs, one per line.
    int main() {
        // Covers: full URL, scheme-less authority, authority with port,
        // scheme without authority, query + fragment, and a URL embedded in text.
        const char* const samples[] = {
            "http://10.130.0.36/rbkt10/+",
            "//10.130.0.36/rbkt10/+",
            "//localhost:443/rbkt10/+",
            "https:/rbkt10/+",
            "https:/rbkt10/+?in_params='please do escape / (forward slash)'&more#also=in/fragment",
            "match inside text http://10.130.0.36/rbkt10/+ is a bit fuzzy",
        };

        for (const char* sample : samples) {
            std::cout << "Encoded URL: " << getEncodedUrl(sample) << std::endl;
        }
    }
    

    打印

    Encoded URL: http//10.130.0.36/rbkt10/%2B
    Encoded URL: //10.130.0.36/rbkt10/%2B
    Encoded URL: //localhost%3A443/rbkt10/%2B
    Encoded URL: https/rbkt10/%2B
    Encoded URL: https/rbkt10/%2B?in_params%3D%27please%20do%20escape%20%2F%20%28forward%20slash%29%27%26more#also%3Din%2Ffragment
    Encoded URL: match inside text http//10.130.0.36/rbkt10/%2B%20is%20a%20bit%20fuzzy
    

    注意

    请注意,这段代码仍然不符合规范:

      

    enter image description here

    这就是您使用库的原因。

    ¹(这会导致 + 没有被包含在匹配之中。它并不是被"重复"了,而只是没有被替换掉,因为 /+ 表示一个或多个 /。)

    ²参见https://en.wikipedia.org/wiki/Uniform_Resource_Identifier#Generic_syntax