Question

我正在尝试在CUDA中实现暴力（brute-force）距离计算算法。

#define VECTOR_DIM 128
// Flat device buffer sized for 1000 points of VECTOR_DIM (128) floats each.
thrust::device_vector<float> feature_data_1(VECTOR_DIM * 1000);
// Flat device buffer sized for 2000 points of VECTOR_DIM (128) floats each.
thrust::device_vector<float> feature_data_2(VECTOR_DIM * 2000);

现在我想要做的是计算第一个矩阵中每个向量与第二个矩阵中每个向量的L2距离（平方差的和）。

因此，如果数组1的大小为1000而数组2的大小为2000，则结果将是1000*2000的浮点矩阵大小。

我想知道是否有办法单独使用Thrust算法来实现这一点。

Answer 1

在CUDA中计算两个不同集合中各点之间的所有成对距离，可以利用以下恒等式来解决：

||x-y||^2=||x||^2+||y||^2-2*<x,y>

其中|| ||是l2范数，<x,y>表示x和y之间的标量积（内积）。

范数||x||和||y||可以通过受“Reduce matrix rows with CUDA”启发的方法计算，而标量积<x,y>则可以使用cublas<t>gemm()以矩阵-矩阵乘法X*Y^T的形式计算。

以下是一个完整的实现。请注意，为了计算范数|| ||，我们给出了两种方法：一种使用cuBLAS的cublas<t>gemv，另一种使用Thrust的transform。对于你感兴趣的问题规模，我在GT540M卡上测得的耗时如下：

Approach nr. 1    0.12ms
Approach nr. 2    0.59ms

#include <cublas_v2.h>

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/random.h>
#include <thrust/sequence.h>

#include <stdio.h>
#include <iostream>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCK_SIZE_X 16
#define BLOCK_SIZE_Y 16

/***********************************************************/
/* SQUARED ABSOLUTE VALUE FUNCTOR - NEEDED FOR APPROACH #1 */
/***********************************************************/
// Maps x to x * x. Returns float: every input and output buffer in this
// listing is float, so a double return type only added conversions.
struct abs2 {
    __host__ __device__ float operator()(const float &x) const { return x * x; }
};

// --- Required for approach #2
// Device-side pointer to the matrix whose squared norms row_reduction computes.
// Must be set from the host with cudaMemcpyToSymbol before row_reduction runs.
__device__ float *vals;

/******************************************/
/* ROW_REDUCTION - NEEDED FOR APPROACH #2 */
/******************************************/
// Binary functor for thrust::transform: for index y, returns the sum of
// squares of the Ncols consecutive floats vals[y * Ncols .. y * Ncols + Ncols - 1].
// The first argument is ignored; it only exists so the functor can be used
// with the two-input form of thrust::transform.
struct row_reduction {

    const int Ncols;    // --- Number of consecutive elements reduced per index

    row_reduction(int _Ncols) : Ncols(_Ncols) {}

    // Arguments are const references so that values produced by
    // thrust::counting_iterator can bind to them.
    __device__ float operator()(const float &x, const int &y) const {
        (void)x;
        const float *row = vals + y * Ncols;
        float temp = 0.f;
        for (int i = 0; i < Ncols; i++)
            temp += row[i] * row[i];
        return temp;
    }
};

/************************************************/
/* KERNEL FUNCTION TO ASSEMBLE THE FINAL RESULT */
/************************************************/
// Turns the scalar products stored in d_dots into squared distances, in place:
//     d_dots[i * NX + j] = d_norms_x_2[j] + d_norms_y_2[i] - 2 * d_dots[i * NX + j]
// for 0 <= j < NX and 0 <= i < NY.
//
// Launch layout: 2D grid of 2D blocks. The x dimension must cover NX (index j)
// and the y dimension must cover NY (index i); surplus threads are masked by
// the bounds check. j is taken from threadIdx.x because it is the
// unit-stride index of d_dots, so neighbouring threads touch neighbouring
// addresses. No shared memory is used.
__global__ void assemble_final_result(const float * __restrict__ d_norms_x_2,
                                      const float * __restrict__ d_norms_y_2,
                                      float * __restrict__ d_dots,
                                      const int NX, const int NY)
{
    // Global index = thread offset + block offset * threads-per-block
    // (blockDim, not gridDim).
    const int j = threadIdx.x + blockIdx.x * blockDim.x;    // --- Index over the NX points of X
    const int i = threadIdx.y + blockIdx.y * blockDim.y;    // --- Index over the NY points of Y

    if ((i < NY) && (j < NX)) {
        // size_t keeps the flat offset from overflowing int for large NX * NY.
        const size_t idx = (size_t)i * NX + j;
        d_dots[idx] = d_norms_x_2[j] + d_norms_y_2[i] - 2.0f * d_dots[idx];
    }
}

/********/
/* MAIN */
/********/
// Demo driver: fills X (Ndims x NX) and Y (Ndims x NY) with random integers,
// computes every pairwise squared distance ||x - y||^2 through
// ||x||^2 + ||y||^2 - 2 <x, y>, and prints the NX * NY results one per line.
int main()
{
    //const int Ndims = 128;    // --- Number of rows
    //const int NX    = 1000;   // --- Number of columns
    //const int NY    = 2000;   // --- Number of columns

    const int Ndims = 3;        // --- Number of rows
    const int NX    = 4;        // --- Number of columns
    const int NY    = 5;        // --- Number of columns

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrices allocation and initialization
    // Filled on the host and uploaded with one copy per matrix; assigning
    // device_vector elements one by one issues a separate transfer each time.
    // X is drawn before Y, so the generated values are the same as with
    // element-wise device initialization.
    thrust::host_vector<float> h_X(Ndims * NX);
    thrust::host_vector<float> h_Y(Ndims * NY);
    for (size_t i = 0; i < h_X.size(); i++) h_X[i] = (float)dist(rng);
    for (size_t i = 0; i < h_Y.size(); i++) h_Y[i] = (float)dist(rng);
    thrust::device_vector<float> d_X = h_X;
    thrust::device_vector<float> d_Y = h_Y;

    TimingGPU timerGPU;

    // --- cuBLAS handle creation
    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    /**********************************************/
    /* CALCULATING THE NORMS OF THE ELEMENTS OF X */
    /**********************************************/
    thrust::device_vector<float> d_norms_x_2(NX);

    // --- Approach nr. 1
    //timerGPU.StartCounter();
    thrust::device_vector<float> d_X_2(Ndims * NX);
    thrust::transform(d_X.begin(), d_X.end(), d_X_2.begin(), abs2());

    thrust::device_vector<float> d_ones(Ndims, 1.f);

    float alpha = 1.f;
    float beta  = 0.f;
    // (X.^2)^T * ones: sums each column of the Ndims x NX column-major matrix,
    // giving one squared norm per point.
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NX, &alpha,
                               thrust::raw_pointer_cast(d_X_2.data()), Ndims,
                               thrust::raw_pointer_cast(d_ones.data()), 1, &beta,
                               thrust::raw_pointer_cast(d_norms_x_2.data()), 1));
    //printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());

    // --- Approach nr. 2
    //timerGPU.StartCounter();
    // float *s_vals = thrust::raw_pointer_cast(&d_X[0]);
    // gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
    // thrust::transform(d_norms_x_2.begin(), d_norms_x_2.end(), thrust::counting_iterator<int>(0), d_norms_x_2.begin(), row_reduction(Ndims));
    //printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());

    /**********************************************/
    /* CALCULATING THE NORMS OF THE ELEMENTS OF Y */
    /**********************************************/
    // Both buffers are sized with NY: the transform writes Ndims * NY values
    // and the gemv writes NY values, so NX-sized buffers overflow when NY > NX.
    thrust::device_vector<float> d_norms_y_2(NY);

    thrust::device_vector<float> d_Y_2(Ndims * NY);
    thrust::transform(d_Y.begin(), d_Y.end(), d_Y_2.begin(), abs2());
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_T, Ndims, NY, &alpha,
                               thrust::raw_pointer_cast(d_Y_2.data()), Ndims,
                               thrust::raw_pointer_cast(d_ones.data()), 1, &beta,
                               thrust::raw_pointer_cast(d_norms_y_2.data()), 1));

    /***********************************/
    /* CALCULATING THE SCALAR PRODUCTS */
    /***********************************/
    // d_dots = X^T * Y, an NX x NY column-major matrix with leading dimension
    // NX, so <x_j, y_i> lands at d_dots[i * NX + j].
    thrust::device_vector<float> d_dots(NX * NY);
    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, NX, NY, Ndims, &alpha,
                               thrust::raw_pointer_cast(d_X.data()), Ndims,
                               thrust::raw_pointer_cast(d_Y.data()), Ndims, &beta,
                               thrust::raw_pointer_cast(d_dots.data()), NX));

    /*****************************/
    /* ASSEMBLE THE FINAL RESULT */
    /*****************************/
    // Grid x covers NX and grid y covers NY, matching the kernel's j / i indices.
    dim3 dimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y);
    dim3 dimGrid(iDivUp(NX, BLOCK_SIZE_X), iDivUp(NY, BLOCK_SIZE_Y));
    assemble_final_result<<<dimGrid, dimBlock>>>(thrust::raw_pointer_cast(d_norms_x_2.data()),
                                                 thrust::raw_pointer_cast(d_norms_y_2.data()),
                                                 thrust::raw_pointer_cast(d_dots.data()), NX, NY);
    // Kernel launches do not return a status: check the launch configuration
    // and then the execution explicitly. gpuErrchk is the same macro the
    // approach #2 lines above rely on (presumably declared in Utilities.cuh).
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // One device-to-host copy instead of one transfer per printed element.
    thrust::host_vector<float> h_dots = d_dots;
    for (int i = 0; i < NX * NY; i++) std::cout << h_dots[i] << "\n";

    // --- cuBLAS handle release
    cublasSafeCall(cublasDestroy(handle));

    return 0;
}

Utilities.cu和Utilities.cuh文件维护在here，此处省略。TimingGPU.cu和TimingGPU.cuh维护在here，也会被省略。

使用CUDA计算不同集合中各点之间的所有对距离

1 个答案: