Question

我有这样的功能。

输入参数-用户名矢量，字符串矢量，最高用户数。

首先，我计算每个用户在字符串中的出现次数。如果一个字符串中有多个出现次数-仍然算作1。

然后我按出现次数排序。如果出现次数相等，请按字母顺序对用户名进行排序。

该函数返回出现率最高的前N个用户。

std::vector<std::string> GetTopUsers(const std::vector<std::string>& users,
    const std::vector<std::string>& lines, const int topUsersNum) {
    std::vector<std::pair<std::string, int>> userOccurancies;

    //count user occurancies
    for (const auto & user : users) {
        int count = 0;
        for (const auto &line : lines) {
            std::regex rgx("\\b" + user + "\\b", std::regex::icase);
            std::smatch match;
            if (std::regex_search(line, match, rgx)) {
                ++count;
                auto userIter = std::find_if(userOccurancies.begin(), userOccurancies.end(),
                    [&user](const std::pair<std::string, int>& element) { return element.first == user; });
                if (userIter == userOccurancies.end()) {
                    userOccurancies.push_back(std::make_pair(user, count));
                }
                else {
                    userIter->second = count;
                }
            }
        }
    }

    //sort by amount of occurancies, if occurancies are equal - sort alphabetically
    std::sort(userOccurancies.begin(), userOccurancies.end(),
        [](const std::pair<std::string, int>& p1, const std::pair<std::string, int>& p2)
    { return (p1.second > p2.second) ? true : (p1.second == p2.second ? p1.first < p2.first : false); });

    //extract top N users
    int topUsersSz = (topUsersNum <= userOccurancies.size() ? topUsersNum : userOccurancies.size());
    std::vector<std::string> topUsers(topUsersSz);
    for (int i = 0; i < topUsersSz; i++) {
        topUsers.push_back(userOccurancies[i].first);
    }

    return topUsers;
}

所以输入

    std::vector<std::string> users = { "john", "atest", "qwe" };
    std::vector<std::string> lines = { "atest john", "Qwe", "qwe1", "qwe," };

    int topUsersNum = 4;

输出将为qwe atest john

但是看起来很复杂。 O（n ^ 2）用于循环+正则表达式。它必须是O（n ^ 3）或更大。

能否给我一些建议，以使其在c ++ 11中更简单地实现？

还请给我有关代码的建议。

或者也许有更好的董事会来解决有关复杂性和性能的问题？

谢谢。

UDP

   std::vector<std::string> GetTopUsers2(const std::vector<std::string>& users,
    const std::vector<std::string>& lines, const size_t topUsersNum) {
    std::vector<std::pair<std::string, int>> userOccurancies(users.size());

    auto userOcIt = userOccurancies.begin();
    for (const auto & user : users) {
        userOcIt->first = std::move(user);
        userOcIt->second = 0;
        userOcIt++;
    }

    //count user occurancies
    for (auto &user: userOccurancies) {
        int count = 0;
        std::regex rgx("\\b" + user.first + "\\b", std::regex::icase);
        std::smatch match;
        for (const auto &line : lines) {
            if (std::regex_search(line, match, rgx)) {
                ++count;
                user.second = count;
            }
        }
    }

    //sort by amount of occurancies, if occurancies are equal - sort alphabetically
    std::sort(userOccurancies.begin(), userOccurancies.end(),
        [](const std::pair<std::string, int>& p1, const std::pair<std::string, int>& p2)
    { return (p1.second > p2.second) ? true : (p1.second == p2.second ? p1.first < p2.first : false); });

    //extract top N users
    auto middle = userOccurancies.begin() + std::min(topUsersNum, userOccurancies.size());
    int topUsersSz = (topUsersNum <= userOccurancies.size() ? topUsersNum : userOccurancies.size());
    std::vector<std::string> topUsers(topUsersSz);
    auto topIter = topUsers.begin();
    for (auto iter = userOccurancies.begin(); iter != middle; iter++) {
        *topIter = std::move(iter->first);
        topIter++;
    }

    return topUsers;
}

感谢@ Jarod42。我更新了第一部分。我认为在构造函数中一次向vector分配内存要比每次调用emplace_back都要快，所以我使用了它。如果我错了-标记我。

我也使用c ++ 11，而不是c ++ 17。

时间结果：

Old: 3539400.00000 nanoseconds
New: 2674000.00000 nanoseconds

更好，但看起来仍然复杂，不是吗？

Answer 1

构造正则表达式的成本很高，并且可以在循环外移动：

您也可以移动字符串而不是复制。

您不需要对所有范围进行排序。 std::partial_sort就足够了。

更重要的是，您可以避免使用内部find_if。

std::vector<std::string>
GetTopUsers(
    std::vector<std::string> users,
    const std::vector<std::string>& lines,
    int topUsersNum)
{
    std::vector<std::pair<std::string, std::size_t> userCount;
    userCount.reserve(users.size());

    for (auto& user : users) {
        userCount.emplace_back(std::move(user), 0);
    }

    for (auto& [user, count] : userCount) {
        std::regex rgx("\\b" + user + "\\b", std::regex::icase);
        for (const auto &line : lines) {
            std::smatch match;
            if (std::regex_search(line, match, rgx)) {
                ++count;
            }
        }
    }

    //sort by amount of occurancies, if occurancies are equal - sort alphabetically
    auto middle = userCount.begin() + std::min(topUsersNum, userCount.size());
    std::partial_sort(userCount.begin(),
                      middle,
                      userCount.end(),
                      [](const auto& lhs, const auto& rhs)
        {
            return std::tie(rhs.second, lhs.first) < std::tie(lhs.second, rhs.first);
        });

    //extract top N users
    std::vector<std::string> topUsers;
    topUsers.reserve(std::distance(userCount.begin(), middle));
    for (auto it = userCount.begin(); it != middle; ++it) {
        topUsers.push_back(std::move(it->first));
    }
    return topUsers;
}

Answer 2

我不是专业的编码人员，但是我使您的代码速度更快（除非我的数学错误或我将时间设置错了，否则速度会加快90％）。

它的作用是遍历每一行，并针对每一行计算给定每个用户的出现次数。如果当前用户的出现次数大于前一个用户的出现次数，则会将用户移动到向量的开头。

#include <iostream>
#include <Windows.h>
#include <vector>
#include <string>
#include <regex>
#include <algorithm>
#include <chrono>

std::vector<std::string> GetTopUsers(const std::vector<std::string>& users,
    const std::vector<std::string>& lines, const int topUsersNum) {
    std::vector<std::pair<std::string, int>> userOccurancies;

    //count user occurancies
    for (const auto & user : users) {
        int count = 0;
        for (const auto &line : lines) {
            std::regex rgx("\\b" + user + "\\b", std::regex::icase);
            std::smatch match;
            if (std::regex_search(line, match, rgx)) {
                ++count;
                auto userIter = std::find_if(userOccurancies.begin(), userOccurancies.end(),
                    [&user](const std::pair<std::string, int>& element) { return element.first == user; });
                if (userIter == userOccurancies.end()) {
                    userOccurancies.push_back(std::make_pair(user, count));
                }
                else {
                    userIter->second = count;
                }
            }
        }
    }

    //sort by amount of occurancies, if occurancies are equal - sort alphabetically
    std::sort(userOccurancies.begin(), userOccurancies.end(),
        [](const std::pair<std::string, int>& p1, const std::pair<std::string, int>& p2)
    { return (p1.second > p2.second) ? true : (p1.second == p2.second ? p1.first < p2.first : false); });

    //extract top N users
    int topUsersSz = (topUsersNum <= userOccurancies.size() ? topUsersNum : userOccurancies.size());
    std::vector<std::string> topUsers(topUsersSz);
    for (int i = 0; i < topUsersSz; i++) {
        topUsers.push_back(userOccurancies[i].first);
    }

    return topUsers;
}

unsigned int count_user_occurences(
    std::string & line,
    std::string & user
)
{
    unsigned int occur                  = {};
    std::string::size_type curr_index   = {};

    // while we can find the name of the user in the line, and we have not reached the end of the line
    while((curr_index = line.find(user, curr_index)) != std::string::npos)
    {
        // increase the number of occurences
        ++occur;
        // increase string index to skip the current user
        curr_index += user.length();
    }

    // return the number of occurences
    return occur;
}

std::vector<std::string> get_top_users(
    std::vector<std::string> & user_list,
    std::vector<std::string> & line_list
)
{
    // create vector to hold results
    std::vector<std::string> top_users = {};

    // put all of the users inside the "top_users" vector
    top_users = user_list;

    // make sure none of the vectors are empty
    if(false == user_list.empty()
        && false == line_list.empty())
    {
        // go trough each one of the lines
        for(unsigned int i = {}; i < line_list.size(); ++i)
        {
            // holds the number of occurences for the previous user
            unsigned int last_user_occur = {};

            // go trough each one of the users (we copied the list into "top_users")
            for(unsigned int j = {}; j < top_users.size(); ++j)
            {
                // get the number of the current user in the current line
                unsigned int curr_user_occur = count_user_occurences(line_list.at(i), top_users.at(j));
                // user temporary name holder
                std::string temp_user = {};

                // if the number of occurences of the current user is larger than the one of the previous user, move it at the top
                if(curr_user_occur >= last_user_occur)
                {
                    // save the current user's name
                    temp_user = top_users.at(j);

                    // erase the user from its current position
                    top_users.erase(top_users.begin() + j);

                    // move the user at the beginning of the vector
                    top_users.insert(top_users.begin(), temp_user);
                }

                // save the occurences of the current user to compare further users
                last_user_occur = curr_user_occur;
            }
        }
    }

    // return the top user vector
    return top_users;
}

int main()
{
    std::vector<std::string> users = { "john", "atest", "qwe" };
    std::vector<std::string> lines = { "atest john", "Qwe", "qwel", "qwe," };

    // time the first function
    auto start = std::chrono::high_resolution_clock::now();
    std::vector<std::string> top_users = get_top_users(users, lines);   
    auto stop = std::chrono::high_resolution_clock::now();
    // save the time in milliseconds
    double time = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count();

    // print time
    printf("%.05f nanoseconds\n", time);

    // time the second function
    auto start2 = std::chrono::high_resolution_clock::now();    
    std::vector<std::string> top_users2 = GetTopUsers(users, lines, 4);
    auto stop2 = std::chrono::high_resolution_clock::now();
    // save the time in milliseconds
    double time2 = std::chrono::duration_cast<std::chrono::nanoseconds>(stop2 - start2).count();

    // print time
    printf("%.05f nanoseconds", time2);

    getchar();

    return 0;
}

结果（至少对于我的PC，它们在多次运行中非常一致）：

366800.00000 nanoseconds
4235900.00000 nanoseconds

如何使用嵌套循环和正则表达式降低c ++下的时间复杂度？

2 个答案: