确定向量y的元素是否出现在向量x中的最快方法

时间:2016-03-06 10:37:00

标签: c++

我有以下问题:我有两个类型为double的向量xy越来越多的排序,我想获得一个向量z,指示{的元素是否{ {1}}中存在{1}}。到目前为止,我已经在for循环中使用y,如下图所示,但我认为应该有一种更快的方式来利用x排序的事实? 问题是,这需要超快,因为它证明是我的代码中的瓶颈。

对于熟悉R的人,我需要等同于std::binary_search

x

修改

以下是我的问题的代表性示例数据:

match(y, x, nomatch = 0L) > 0L

8 个答案:

答案 0 :(得分:6)

您可以使用std::set_itersection

#include <vector>
#include <algorithm>
#include <iterator>
#include <iostream>

int main()
{
    std::vector<double> x {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    std::vector<double> y {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

    std::vector<double> z {};

    std::set_intersection(std::cbegin(x), std::cend(x), 
                          std::cbegin(y), std::cend(y), 
                          std::back_inserter(z));

   std::copy(std::cbegin(z), std::cend(z),
             std::ostream_iterator<double> {std::cout, " "});
}

修改

要在评论中提到DieterLücking点,这里有一个更接近R匹配函数的版本:

#include <vector>
#include <deque>
#include <algorithm>
#include <iterator>
#include <functional>
#include <memory>
#include <iostream>

template <typename T>
std::deque<bool> match(const std::vector<T>& y, const std::vector<T>& x)
{
    std::vector<std::reference_wrapper<const T>> z {};
    z.reserve(std::min(y.size(), x.size()));

    std::set_intersection(std::cbegin(y), std::cend(y),
                          std::cbegin(x), std::cend(x),
                          std::back_inserter(z));

    std::deque<bool> result(y.size(), false);

    for (const auto& e : z) {
        result[std::distance(std::addressof(y.front()), std::addressof(e.get()))] = true;
    }

    return result;
}

int main()
{
    std::vector<double> x {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    std::vector<double> y {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

    const auto matches = match(y, x);

    std::copy(std::cbegin(matches), std::cend(matches),
              std::ostream_iterator<bool> {std::cout});
}

答案 1 :(得分:3)

我拿起你所有的代码,Dieter计时样本和OP的5000个随机双打的样本数据来执行所有替代方案的更完整的计时。这是代码:

#include <chrono>
#include <iostream>
#include <algorithm>
#include <vector>
#include <iterator>
#include <cstdlib>
#include <ctime>
#include <assert.h>
#include <deque>
#include <functional>
#include <memory>

using namespace std;

double RandomNumber () { return (std::rand() / 10e+7); }

template <typename T>
std::deque<bool> match(const std::vector<T>& y, const std::vector<T>& x)
{
    std::vector<std::reference_wrapper<const T>> z {};
    z.reserve(std::min(y.size(), x.size()));

    std::set_intersection(y.cbegin(), y.cend(),
                          x.cbegin(), x.cend(),
                          std::back_inserter(z));

    std::deque<bool> result(y.size(), false);

    for (const auto& e : z) {
        result[std::distance(std::addressof(y.front()), std::addressof(e.get()))] = true;
    }

    return result;
}

int main() {

    const int NTESTS = 10;

    long long time1 = 0;
    long long time2 = 0;
    long long time3 = 0;
    long long time3_prime = 0;
    long long time4 = 0;
    long long time5 = 0;
    long long time6 = 0;

    for (int i = 0; i < NTESTS; ++i){

        std::srand ( unsigned ( std::time(0) ) );

        // 5000 is representative
        int n = 5000;

        std::vector<double> x (n);
        std::generate (x.begin(), x.end(), RandomNumber);

        std::vector<double> y (n);
        std::generate (y.begin(), y.end(), RandomNumber);

        for(std::vector<double>::const_iterator i = x.begin(); i != x.end(); i++) {
            y.push_back(*i);
        }

        std::sort(x.begin(), x.end());
        std::sort(y.begin(), y.end());

        vector<bool> z1(y.size());
        vector<unsigned char> z2(y.size());
        vector<unsigned char> z3(y.size());
        std::deque<bool> z3_prime;
        vector<bool> z4(y.size());
        std::vector<bool> z5(y.size());
        std::vector<bool> z6(y.size());

        // Original
        {
            auto start = std::chrono::high_resolution_clock::now();

            for (size_t i = 0; i != y.size(); ++i) {
                z1[i] = binary_search(x.begin(), x.end(), y[i]);
            }
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time1 += duration.count();
        }

        // Original (replacing vector<bool> by vector<unsigned char>)
        {
            auto start = std::chrono::high_resolution_clock::now();

            for (size_t i = 0; i != y.size(); ++i) {
                z2[i] = binary_search(x.begin(), x.end(), y[i]);
            }
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time2 += duration.count();
        }

        {  // Dieter Lücking set_intersection
            auto start = std::chrono::high_resolution_clock::now();

            size_t ix = 0;
            size_t iy = 0;
            while(ix < x.size() && iy < y.size())
            {
                if(x[ix] < y[iy]) ++ix;
                else if(y[iy] < x[ix]) ++iy;
                else {
                    z3[iy] = 1;
                    // ++ix; Not this if one vector is not uniquely sorted
                    ++iy;
                }
            }
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time3 += duration.count();
        }

        // Std::set_intersection
        {
            auto start = std::chrono::high_resolution_clock::now();

            z3_prime = match(y, x);

            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time3_prime += duration.count();
        }

        { // Ed Heal
            auto start = std::chrono::high_resolution_clock::now();

            int i_x = 0, i_y = 0;
            while (i_x < x.size() && i_y < y.size())
            {
                if (x[i_x] == y[i_y]) {
                    //cout << "In both" << x[i_x] << endl;
                    z4[i_y] = true;
                    ++i_x;
                    ++i_y;
                } else if (x[i_x] < y[i_y]) {
                    ++i_x;
                } else {
                    z4[i_y] = false;
                    ++i_y;
                }
            }

           /* for (; i_y < y.size(); ++i_y) {
                //Empty
            } */
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time4 += duration.count();
        }

        { //  JacquesdeHooge
            auto start = std::chrono::high_resolution_clock::now();
            auto it_x = x.begin();
            int i = 0;
            for (; i < (int)y.size(); ++i) {
                it_x = std::lower_bound(it_x, x.end(), y[i]);
                if (it_x == x.end()) break;
                z5[i] = *it_x == y[i];
            }
            std::fill(z5.begin() + i, z5.end(), false);
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time5 += duration.count();
        }

        { // Skizz
            auto start = std::chrono::high_resolution_clock::now();
            vector<double>::iterator a = x.begin(), b = y.begin();
            int i = 0;
            while (a != x.end () && b != y.end ())
            {
                if (*a == *b) {
                    z6[i] = true;
                    ++a;
                    ++b;
                }
                else
                {
                    z6[i] = false;
                    if (*a < *b)
                    {
                        ++a;
                    }
                    else
                    {
                        ++b;
                    }
                }
                i++;
            }
            auto stop = std::chrono::high_resolution_clock::now();
            auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
            time6 += duration.count();
        }

        assert (std::equal(z1.begin(), z1.begin() + 5000, z2.begin()));
        assert (std::equal(z1.begin(), z1.begin() + 5000, z3.begin()));
        assert (std::equal(z1.begin(), z1.begin() + 5000, z3_prime.begin()));
        assert (std::equal(z1.begin(), z1.begin() + 5000, z4.begin()));
        assert (std::equal(z1.begin(), z1.begin() + 5000, z5.begin()));
        assert (std::equal(z1.begin(), z1.begin() + 5000, z6.begin()));
    }

    cout << "Original - vector<bool>: \t\t" << time1 << " ns\n";
    cout << "Original - vector<unsigned char>: \t" << time2 << " ns\n";
    cout << "Set intersection (Daniel): \t\t" << time3_prime << " ns\n";
    cout << "Set intersection (Dieter Lücking): \t" << time3 << " ns\n";
    cout << "Ed Heal: \t\t\t\t" << time4 << " ns\n";
    cout << "JackesdeHooge: \t\t\t\t" << time5 << " ns\n";
    cout << "Skizz: \t\t\t\t\t" << time6 << " ns\n";
    cout << endl;

    return 0;
}

我的结果与g ++ 5.2.1 -std :: c ++ 11和-O3:

原始 - 矢量:10152069 ns

原始 - 矢量:8686619 ns

设置交集(丹尼尔):1768855 ns

设置交叉点(DieterLücking):1617106 ns

Ed Heal:1446596 ns

JackesdeHooge:3998958 ns

Skizz:1385193 ns

*请注意Ed Heal和Skizz解决方案基本相同。

答案 2 :(得分:2)

由于两个向量都已排序,因此您必须仅对第二个向量的剩余部分应用bin搜索。

所以,如果你是在y [j]之前找不到x [i],你确定在y [j]之前你也找不到x [i + 1]。因此,在找到x [i + 1]的匹配时,从y [j]开始应用bin搜索就足够了。

答案 3 :(得分:2)

在我的脑海中,我只能想到这一点: -

vector<double>::iterator a = x.begin(), b = y.begin();

while (a != x.end () && b != y.end ())
{
  if (*a == *b)
  {
     // value is in both containers
     ++a;
  }
  else
  {
    if (*a < *b)
    {
      ++a;
    }
    else
    {
      ++b;
    }
  }
}

答案 4 :(得分:1)

也许这个算法会更好,因为两个向量是排序的。时间复杂度是线性的。

#include <iostream>
#include <algorithm>
#include <vector>

int main() {

    using namespace std;

    vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
    vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

    vector<bool> z(y.size());
    int i_x = 0, i_y = 0;
    while (i_x < x.size() && i_y < y.size())
    {
        if (x[i_x] == y[i_y]) {
            cout << "In both" << x[i_x] << endl;
            z[i_y] = true;
            ++i_x;
            ++i_y;
        } else if (x[i_x] < y[i_y]) {
            ++i_x;
        } else {
            z[i_y] = false;
            ++i_y;
        }
    }

    for (; i_y < y.size(); ++i_y) {
        //Empty
    }    
    for (vector<bool>::const_iterator i = z.begin(); i != z.end(); ++i)
        cout << *i << " ";

    return 0;
}

答案 5 :(得分:1)

@ JacquesdeHooge回答的实现:

std::vector<bool> ComputeMatchFlags(const std::vector<double>& x,
                                    const std::vector<double>& y) {
  std::vector<bool> found(y.size());
  auto it_x = x.begin();
  int i = 0;
  for (; i < (int)y.size(); ++i) {
    it_x = std::lower_bound(it_x, x.end(), y[i]);
    if (it_x == x.end()) break;
    found[i] = *it_x == y[i];
  }
  std::fill(found.begin() + i, found.end(), false);
  return found;
}

答案 6 :(得分:1)

当您找到一个元素(或元素中的一个元素)时,您不需要考虑之前出现的元素。因此,请使用先前查找的结果而不是x.begin()

由于std::binary_search未返回迭代器,因此请改用std::lower_bound。另请考虑std::find(是线性搜索,实际上可能更快,具体取决于您的数据)。

如果这没有带来足够的改进,请尝试std::unordered_set而不是数组。

答案 7 :(得分:1)

只是二进制搜索的时间并与使用std :: vector的改进设置交集:

#include <chrono>
#include <iostream>
#include <algorithm>
#include <vector>

int main() {
    using namespace std;

    // Original
    {
        vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
        vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

        auto start = std::chrono::high_resolution_clock::now();
        vector<bool> z(y.size());
        for (size_t i = 0; i != y.size(); ++i)
            z[i] = binary_search(x.begin(), x.end(), y[i]);
        auto stop = std::chrono::high_resolution_clock::now();
        auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
        cout << "vector<bool>: " << duration.count() << "ns\n";
        for (auto i = z.begin(); i != z.end(); ++i)
            cout << unsigned(*i) << " ";
        cout << '\n';
    }

    // Original (replacing vector<bool> by vector<unsigned char>)
    {
        vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
        vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

        auto start = std::chrono::high_resolution_clock::now();
        vector<unsigned char> z(y.size());
        for (size_t i = 0; i != y.size(); ++i)
            z[i] = binary_search(x.begin(), x.end(), y[i]);
        auto stop = std::chrono::high_resolution_clock::now();
        auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
        cout << "vector<unsigned char>: " << duration.count() << "ns\n";
        for (auto i = z.begin(); i != z.end(); ++i)
            cout << unsigned(*i) << " ";
        cout << '\n';
    }

    // Similar to std::set_intersection
    {
        vector<double> x = {1.8, 2.4, 3.3, 4.2, 5.6,7.9, 8.5, 9.3};
        vector<double> y = {0.5, 0.98, 1.8, 3.1, 5.6, 6.6, 9.3, 9.3, 9.5};

        auto start = std::chrono::high_resolution_clock::now();
        vector<unsigned char> z(y.size());
        size_t ix = 0;
        size_t iy = 0;
        while(ix < x.size() && iy < y.size())
        {
            if(x[ix] < y[iy]) ++ix;
            else if(y[iy] < x[ix]) ++iy;
            else {
                z[iy] = 1;
                // ++ix; Not this if one vector is not uniquely sorted
                ++iy;
            }
        }
        auto stop = std::chrono::high_resolution_clock::now();
        auto duration = chrono::duration_cast<chrono::nanoseconds>(stop - start);
        cout << "set intersection: " << duration.count() << "ns\n";
        for (auto i = z.begin(); i != z.end(); ++i)
            cout << unsigned(*i) << " ";
        cout << '\n';
    }
    return 0;
}

用g ++编译-std = c ++ 11 -O3(g ++ 4.84)给出:

vector<bool>: 3622ns
0 0 1 0 1 0 1 1 0 
vector<unsigned char>: 1635ns
0 0 1 0 1 0 1 1 0 
set intersection: 1299ns
0 0 1 0 1 0 1 1 0