Question

我有两个向量（各自的元素都不重复），它们有一部分整数是共有的。我想尽可能高效地计算出其中一个向量里同时也出现在另一个向量中的那些元素的索引。你能给出比我这个简陋而低效的实现更快的方案吗？

编辑：向量并未排序，我们需要的是未排序向量中的索引。此外，在解决问题时，禁止修改初始向量（random_vec_1和random_vec_2）。

#include <algorithm>
#include <chrono>
#include <iostream>
#include <random>
#include <set>
#include <unordered_set>
#include <vector>

using namespace std::chrono;

// Benchmark driver: builds two shuffled vectors of unique random integers and
// times how long it takes to collect the indices of random_vec_2 whose values
// also occur in random_vec_1. Prints the vector sizes, the elapsed time and
// the number of matches.
int main() {

    // Setup 1: Construct two vectors with random integers.
    constexpr size_t num = 1000;

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dis(0, num);

    std::vector<int> random_vec_1;
    std::vector<int> random_vec_2;
    random_vec_1.reserve(num);
    random_vec_2.reserve(num);
    for (size_t i = 0u; i < num; ++i) {
        random_vec_1.push_back(dis(gen));
        random_vec_2.push_back(dis(gen));
    }
    // Setup 2: Make elements unique and shuffle them.
    std::set<int> s1(random_vec_1.begin(), random_vec_1.end());
    std::set<int> s2(random_vec_2.begin(), random_vec_2.end());
    random_vec_1.assign(s1.begin(), s1.end());
    random_vec_2.assign(s2.begin(), s2.end());
    // std::random_shuffle was deprecated in C++14 and removed in C++17;
    // std::shuffle driven by the engine created above is the replacement.
    std::shuffle(random_vec_1.begin(), random_vec_1.end(), gen);
    std::shuffle(random_vec_2.begin(), random_vec_2.end(), gen);


    std::cout << "size random_vec_1: " << random_vec_1.size() << "\n";
    std::cout << "size random_vec_2: " << random_vec_2.size() << "\n";

    // steady_clock is guaranteed monotonic, which is what interval
    // measurement needs; high_resolution_clock carries no such guarantee.
    auto begin1 = steady_clock::now();

    // Solve problem -------------------------------------------
    // Hash every value of random_vec_1, then keep each position of
    // random_vec_2 whose value is found in that set.
    std::vector<size_t> match_index_2;
    std::unordered_set<int> my_set(random_vec_1.begin(), random_vec_1.end());
    for (size_t i = 0u; i < random_vec_2.size(); ++i) {
        if (my_set.count(random_vec_2[i]) == 1u)
            match_index_2.push_back(i);
    }
    // ---------------------------------------------------------

    auto end1 = steady_clock::now();
    auto ticks1 = duration_cast<microseconds>(end1-begin1);
    std::cout << "Set approach took " << ticks1.count() << " microseconds.\n";
    std::cout << "Number of common indices: " << match_index_2.size() << "\n";

}

Answer 1

vector现在如此之快，以至于我不会使用set：

将第一个向量复制到一个新向量，例如new_vec_1；
对new_vec_1排序；
使用binary_search在new_vec_1中查找值。

代码：

// Work on a sorted private copy of the first vector so that membership can be
// answered with a binary search; the original vectors are left untouched.
std::vector<int> new_vec_1(random_vec_1.begin(), random_vec_1.end());
std::sort(new_vec_1.begin(), new_vec_1.end());

// Every element of random_vec_2 could match, so reserve room for all of them.
std::vector<size_t> match_index_2;
match_index_2.reserve(random_vec_2.size());

const auto sorted_first = new_vec_1.cbegin();
const auto sorted_last = new_vec_1.cend();
size_t pos = 0;
for (const int value : random_vec_2) {
    // pos is the index of value inside the unsorted random_vec_2.
    if (std::binary_search(sorted_first, sorted_last, value))
        match_index_2.push_back(pos);
    ++pos;
}

请参阅ideone上的代码 - 代码的速度是set版本的两倍，我认为它可能会进一步优化。

请注意，此代码在算法上与您的代码相同，但std::vector速度非常快，您可以获得更好的性能。

这是另一种对两个向量进行排序的方法（但速度要快一些）：

std::vector<int> new_vec_1(random_vec_1);
std::vector<int> new_vec_2(random_vec_2);
std::sort(std::begin(new_vec_1), std::end(new_vec_1));
std::sort(std::begin(new_vec_2), std::end(new_vec_2));
std::vector<size_t> match_index_2;
match_index_2.reserve(random_vec_2.size());

for (auto it1 = new_vec_1.begin(), it2 = new_vec_2.begin();
     it1 != new_vec_1.end() && it2 != new_vec_2.end();
     ++it2) {
    while (it1 != new_vec_1.end() && *it1 < *it2) ++it1;
    if (it1 != new_vec_1.end() && *it1 == *it2) {
        match_index_2.push_back(it2 - new_vec_2.begin());
    }
}

Answer 2

新答案

新要求是在计算解决方案时无法修改原始向量。排序交集解决方案不再起作用，因为索引混合在一起。

以下是我的建议：用unordered_map把其中一个向量的值映射到各自的索引，然后遍历另一个向量的值并在map中查找。

// Not necessary, might increase performance
match_index_2.reserve(std::min(random_vec_1.size(), random_vec_2.size()));

// Maps each value of random_vec_2 to its position in random_vec_2.
// The mapped type is std::size_t so that positions are stored without the
// narrowing conversion an int mapped type would force.
std::unordered_map<int, std::size_t> index_map;
// random_vec_2 is the one from which we want the indices.
index_map.reserve(random_vec_2.size());
for (std::size_t i = 0; i < random_vec_2.size(); ++i) {
    index_map.emplace(random_vec_2[i], i);
}

// Look up every value of random_vec_1; matches are appended in the order in
// which the values appear in random_vec_1, not in ascending index order.
for (const int value : random_vec_1) {
    const auto found_it = index_map.find(value);
    if (found_it != index_map.end()) {
        match_index_2.push_back(found_it->second);
    }
}

此外，如果向量中的值位于相对较小的范围内（这正是user2079303所问的情况），则可以用vector代替map，这可能会进一步提高性能。在下文中，我假设值在[0，num]范围内。

match_index_2.reserve(std::min(random_vec_1.size(), random_vec_2.size()));

// Sentinel meaning "this value does not occur in random_vec_2".
// std::size_t is unsigned, so converting -1 yields the largest value it can
// hold, which can never be a valid index into random_vec_2 here.
constexpr std::size_t unmapped = static_cast<std::size_t>(-1);

// Direct lookup table: slot v holds the position of value v in random_vec_2.
// Values lie in the closed range [0, num], so num + 1 slots are required;
// with only num slots the value num would index one past the end.
std::vector<std::size_t> index_map(num + 1, unmapped);
for (std::size_t i = 0; i < random_vec_2.size(); ++i) {
    index_map[random_vec_2[i]] = i;
}

// Matches are appended in the order the values appear in random_vec_1.
for (const int value : random_vec_1) {
    const std::size_t index = index_map[value];
    if (index != unmapped) {
        match_index_2.push_back(index);
    }
}

以前的回答

由于您的向量已经排序（在使用std::set保留唯一元素之后），您可以使用此算法：

// Merge-style intersection of two vectors that must already be sorted in
// ascending order. For every value present in both, the position of that
// value in random_vec_1 is appended to match_index_2.
auto first1 = random_vec_1.begin();
auto last1 = random_vec_1.end();
auto first2 = random_vec_2.begin();
auto last2 = random_vec_2.end();
auto index_offset = first1; // Put first2 if you want the indices of the second vector instead

// The loop body needs its own braces: the closing brace at the end of this
// snippet would otherwise have no matching opening brace.
while (first1 != last1 && first2 != last2) {
    if (*first1 < *first2)
        ++first1;               // value only in the first range so far
    else if (*first2 < *first1)
        ++first2;               // value only in the second range so far
    else {
        // Equal values: record the position and advance both ranges.
        match_index_2.push_back(std::distance(index_offset, first1));
        ++first1;
        ++first2;
    }
}

改编自the gcc libstdc++ source code for std::set_intersection。

这是另一个版本，改编自cppreference：

// Merge-style walk over two ascending-sorted vectors; each value found in
// both contributes its position within random_vec_1 to match_index_2.
auto cur1 = random_vec_1.begin();
const auto end1 = random_vec_1.end();
auto cur2 = random_vec_2.begin();
const auto end2 = random_vec_2.end();
const auto index_base = cur1; // Reported indices are measured from the start of random_vec_1

while (cur1 != end1 && cur2 != end2) {
    if (*cur1 < *cur2) {
        // Current value of the first range cannot occur in the second.
        ++cur1;
        continue;
    }
    if (!(*cur2 < *cur1)) {
        // Neither is smaller, so the values are equal: record and step past it.
        match_index_2.push_back(std::distance(index_base, cur1));
        ++cur1;
    }
    ++cur2;
}

如果您想提高效率，请事先对match_index_2调用reserve。此外，您可以使用std::sort和std::unique来代替set。

// Setup 2: Make elements unique.
// Sort each vector in place, then drop the adjacent duplicates that the sort
// has grouped together.
std::sort(random_vec_1.begin(), random_vec_1.end());
random_vec_1.erase(std::unique(random_vec_1.begin(), random_vec_1.end()),
                   random_vec_1.end());

std::sort(random_vec_2.begin(), random_vec_2.end());
random_vec_2.erase(std::unique(random_vec_2.begin(), random_vec_2.end()),
                   random_vec_2.end());

Answer 3

您可以在值集中创建索引并对其进行操作：

#include <algorithm>
#include <vector>

// Returns positions into v ordered so that the referenced values ascend, with
// exactly one position kept for each distinct value. v itself is not modified.
inline std::vector<std::size_t>  make_unique_sorted_index(const std::vector<int>& v) {
    const auto value_less = [&v] (std::size_t lhs, std::size_t rhs) {
        return v[lhs] < v[rhs];
    };
    const auto value_equal = [&v] (std::size_t lhs, std::size_t rhs) {
        return v[lhs] == v[rhs];
    };

    // Start from the identity permutation 0, 1, ..., v.size() - 1.
    std::vector<std::size_t> order(v.size());
    std::iota(order.begin(), order.end(), 0);

    // Order positions by the value they refer to, then collapse each run of
    // positions that refer to equal values down to a single entry.
    std::sort(order.begin(), order.end(), value_less);
    order.erase(std::unique(order.begin(), order.end(), value_equal), order.end());
    return order;
}

// Computes the intersection of two unsorted vectors and reports it as
// positions into set1: one position for every value that occurs in both
// set1 and set2. The positions are emitted so that the values they refer to
// in set1 are in ascending order. Neither input is modified.
std::vector<std::size_t>  unordered_set_intersection(
    const std::vector<int>& set1,
    const std::vector<int>& set2)
{
    // Value-sorted, duplicate-free position lists for both inputs.
    const std::vector<std::size_t> order1 = make_unique_sorted_index(set1);
    const std::vector<std::size_t> order2 = make_unique_sorted_index(set2);

    std::vector<std::size_t> common;
    common.reserve(std::min(set1.size(), set2.size()));

    // Merge-style walk over both position lists, comparing referenced values.
    auto p1 = order1.begin();
    auto p2 = order2.begin();
    while (p1 != order1.end() && p2 != order2.end()) {
        const int lhs = set1[*p1];
        const int rhs = set2[*p2];
        if (lhs < rhs) {
            ++p1;
            continue;
        }
        if (rhs < lhs) {
            ++p2;
            continue;
        }
        // Same value on both sides: keep the position from set1.
        common.push_back(*p1);
        ++p1;
        ++p2;
    }

    // The reserve above is an upper bound; request release of the unused capacity.
    common.shrink_to_fit();
    return common;
}

注意：跳过第二个索引并对第二个集合的副本进行操作可能会提高性能。

或者，make_unique_sorted_index可能会替换为：

// Returns every position of v, ordered so that the referenced values ascend.
// Unlike the de-duplicating variant, positions of equal values are all kept.
inline std::vector<std::size_t>  make_sorted_index(const std::vector<int>& v) {
    const auto value_less = [&v] (std::size_t lhs, std::size_t rhs) {
        return v[lhs] < v[rhs];
    };

    // Identity permutation 0, 1, ..., v.size() - 1, then order it by value.
    std::vector<std::size_t> order(v.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), value_less);
    return order;
}

如果索引是唯一的，算法会产生稳定的结果：

元素的排序（指向的结果索引）与std :: sort一样稳定。
如果索引不是唯一的，则相同元素的数量（指向的结果索引）分别是第一或第二组中相同元素的最小数量。

Answer 4

实际上，我预计对vector排序会明显快于创建std::set，因为STL的set是一棵树，而int的vector可以用计数排序在线性时间内完成排序——计数时如果每个值至多记一次，得到的结果就是一个集合。创建set需要n次插入，每次插入的代价为log n，总计O（n log n）；而如上所述，排序只需O（n）。

在排好序的向量上，您可以运行std::set_intersection，它的运行时间同样与两个输入中较大者的长度成线性关系。

因此，您应该能够在线性时间内完成此操作。

如果不允许修改向量，则可以使用哈希表（std::unordered_map）把值映射到它在原始向量中的索引。请注意，由于您没有提到数字是唯一的，您会先得到结果值，例如同时包含在两个集合中的x_1，...，x_n，然后再用这个哈希表把它们映射回原始向量中的索引。

高效地计算两个向量的共同元素的索引

4 个答案:

新答案

以前的回答