C ++代码：

#include <iostream>
#include <random>
#include <map>
#include <cmath>
#include <numeric> 
#include <algorithm>
#include <chrono>
#include <vector>     // std::iota

using namespace std;
using namespace std::chrono;

/// Euclidean distance between two points stored as raw arrays.
///
/// @param arr1 first point; at least n readable elements.
/// @param arr2 second point; at least n readable elements.
/// @param n    number of coordinates to compare.
/// @return sqrt(sum_i (arr1[i] - arr2[i])^2); 0.0 when n == 0.
double edist(const double* arr1, const double* arr2, unsigned int n) {
    double sum = 0.0;
    // The index has the same unsigned type as n, so the loop bound is never
    // compared across signedness.
    for (unsigned int i = 0; i < n; ++i) {
        const double diff = arr1[i] - arr2[i];
        // A plain multiply instead of the general-purpose pow(x, 2).
        sum += diff * diff;
    }
    return std::sqrt(sum);
}

/// Indices that would sort `v` in ascending order, with the first (smallest)
/// position dropped.
///
/// @param v values to rank; not modified.
/// @return the sorted index order without its leading entry, i.e.
///         v.size() - 1 indices; empty when v has zero or one element.
template <typename T> std::vector<std::size_t> argsort(const std::vector<T> &v) {
  // Nothing to rank; also avoids stepping an iterator past the end below.
  if (v.empty()) {
    return {};
  }

  // initialize original index locations
  std::vector<std::size_t> idx(v.size());
  std::iota(idx.begin(), idx.end(), std::size_t{0});

  // sort indexes based on comparing values in v
  std::sort(idx.begin(), idx.end(),
            [&v](std::size_t i1, std::size_t i2) { return v[i1] < v[i2]; });

  // Drop the leading index in place rather than copying the tail into a
  // second, freshly allocated vector.
  idx.erase(idx.begin());
  return idx;
}

int main() {

    uint N, M;
    // cin >> N >> M;
    N = 1000;
    M = 800;
    double **arr = new double*[N];
    std::random_device rd; // obtain a random number from hardware
    std::mt19937 eng(rd()); // seed the generator
    std::uniform_real_distribution<> distr(10.0, 60.0);

    for (int i = 0; i < N; i++) {
        arr[i] = new double[M];
        for(int j=0; j < M; j++) {
            arr[i][j] = distr(eng);
        }
    }
    auto start = high_resolution_clock::now();
    map<int, vector<size_t> > dist;

    for (int i=0; i<N; i++) {
        vector<double> distances;
        for(int j=0; j<N; j++) {
            distances.push_back(edist(arr[i], arr[j], N));
        }
        dist[i] = argsort(distances);
    }
    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(stop-start);
    int dur = duration.count();
    cout<<"Time taken by code: "<<dur<<" microseconds"<<endl;
    cout<<" In seconds: "<<dur/pow(10,6);  
        return 0; }

Python代码：

import time
import numpy as np
def comp_inner_raw(i, x):
    """Return the Euclidean distance from ``i`` to each row of ``x``.

    Args:
        i: the reference point; it is subtracted element-wise from ``x[j]``.
        x: indexable collection exposing ``.shape``; ``x.shape[0]`` rows are
            visited.

    Returns:
        A ``float64`` NumPy array of length ``x.shape[0]`` whose ``j``-th entry
        is ``sqrt(sum((i - x[j]) ** 2))``.
    """
    row_count = x.shape[0]
    distances = np.zeros(row_count, dtype=np.float64)
    for row in range(row_count):
        delta = i - x[row]
        squared_total = np.sum(delta ** 2)
        distances[row] = np.sqrt(squared_total)
    return distances
def nearest_ngbr_raw(x): # x = [[1,2,3],[4,5,6],[7,8,9]]
    """Rank, for every row of ``x``, all rows by distance to it.

    Args:
        x: the data set, one point per row. It is handed to
            ``comp_inner_raw``, which reads ``x.shape``, so a NumPy array is
            presumably expected -- TODO confirm against callers.

    Returns:
        A dict mapping each row index to the ``np.argsort`` order of that
        row's distances with the first sorted position removed.
    """
    dist = {}
    for idx, i in enumerate(x):
        lst = comp_inner_raw(i, x)
        s = np.argsort(lst)
        # Only the index order is kept. The reordered copy of the rows that
        # used to be built here was never read, so it is no longer computed.
        dist[idx] = s[1:]
    return dist
# Benchmark: 1000 random points with 800 values each, timed end to end.
arr = np.random.rand(1000, 800)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
table = nearest_ngbr_raw(arr)
print("Time taken to execute the code using raw python is {}".format(time.perf_counter()-start))

编译命令：

g++ -std=c++11 knn.cpp -o knn

用于ubuntu 18.04.1的C ++编译器（g ++）版本： 7.4.0

使用 C++11 标准编写

NumPy 版本：1.16.2

编辑：尝试开启编译器优化后，现在大约需要 1 秒钟。还可以从编码或其他任何角度进一步优化这段 C++ 代码吗？

Answer 1

可以从编码或任何其他角度进一步优化此c ++代码吗？

我至少能看到三处可以优化的地方。前两处很容易实现，绝对应该采用，但在我的测试中，它们最终并没有对运行时间产生可测量的影响。第三处则需要对代码稍作重新设计。

edist计算出代价昂贵的平方根，但您仅将距离用于成对比较。由于平方根函数单调增加，因此对比较结果没有影响。同样，pow(x, 2)可以替换为x * x，有时会更快：

/// Squared Euclidean distance over the first n elements of two vectors.
///
/// No square root is taken: the function returns the raw sum of squared
/// differences.
///
/// @param arr1 first point; must hold at least n elements (not checked).
/// @param arr2 second point; must hold at least n elements (not checked).
/// @param n    number of coordinates to compare.
/// @return sum_i (arr1[i] - arr2[i])^2; 0.0 when n == 0.
double edist(std::vector<double> const& arr1, std::vector<double> const& arr2, unsigned int n) {
    double sum = 0.0;
    // n is spelled with the standard type name, matching the loop index,
    // instead of the non-standard `uint` typedef.
    for (unsigned int i = 0; i < n; i++) {
        auto const diff = arr1[i] - arr2[i];
        sum += diff * diff;
    }
    return sum;
}

argsort执行复制是因为它返回不包括第一个元素的索引。如果改为包含第一个元素（将return语句更改为return idx;），则避免了潜在的昂贵复制。
您的矩阵表示为嵌套数组（由于某种原因，您使用原始指针而不是嵌套的std::vector）。将矩阵表示为连续的N * M数组通常更有效：std::vector<double> arr(N * M);。这也是numpy在内部表示矩阵的方式。这需要更改代码以计算索引。

为什么C ++代码实现的性能不比python实现好？

C ++代码：

Python代码：

编译命令：

1 个答案: