Question

我有一个需要计算100万到1000万个标量积的程序。

看起来像这样。 ts和A是大约1000 - 10000个3D点的数组（每个元素都是3x1向量）。目前，对于ts.size() = 10,000和A.size() = 1000，我的代码大约需要41ms。到目前为止，我还没有进行任何并行化。计算会更快，例如，在CUDA中吗？我没有这样的经历。或者还有其他方法吗？感谢。

// For every element ts[i], count how many elements A[j] give a scalar product
// with ts[i] whose absolute value is below epsilon; the count accumulates in score[i].
for(int i = 0; i< ts.size(); i++){
    for(int j = 0; j< A.size(); j++){
        // NOTE(review): abs must resolve to a floating-point overload here; the
        // integer abs would truncate the product — confirm which headers are included.
        if(abs(scalarProduct(ts.at(i), A.at(j))) <epsilon){
            score[i] +=1;
        }
    }
}

这是我对标量积的实现。

// Returns the scalar (dot) product of two points:
// p1.x*p2.x + p1.y*p2.y + p1.z*p2.z, using the points' getX/getY/getZ accessors.
double scalarProduct(const Point &p1,const Point &p2)
{
return (p1.getX()*p2.getX() + p1.getY()*p2.getY() + p1.getZ()*p2.getZ()) ;
}

我可以使用Lapack或Eigen，将问题表示为矩阵乘法吗？我在Matlab中做到了这一点，它只慢了5倍。任何加速都会很棒。使用OpenMP我想我可以4x更快。

Answer 1

这个答案由两部分组成：

加速计算许多独立的标量产品;
解决您的具体问题。

第1部分

计算大量相互独立的标量积是一个易并行（embarrassingly parallel）问题。如果你的目标只是加速上面提到的标量积计算，而把其余计算保留在CPU上，那么我同意Calvin的观点：大部分时间将花在把 N*M 大小的结果矩阵从设备传回主机（device -> host，即D2H）的内存传输上。但是，如果不计这次传输的时间，加速计算就是值得的。下面的代码显示了这一点，测试平台为八核 Intel Xeon E5-2650 2.00 GHz 处理器和 NVIDIA Kepler K20c 显卡，耗时如下：

CPU: 27ms;     GPU (without D2H transaction): 0.08ms;     GPU (with D2H transaction): 23ms

#include <stdio.h>
#include <time.h>

// Thread-block dimensions: main launches the kernel with
// BLOCKSIZE_X x BLOCKSIZE_Y threads per block.
#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16

/********************/
/* CUDA ERROR CHECK */
/********************/
/* Wraps a CUDA runtime call and reports a failure together with the call site.
   The do/while(0) form makes the macro behave as a single statement, so it is
   safe inside unbraced if/else bodies. */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/* Prints the CUDA error string, file and line to stderr when `code` is not
   cudaSuccess and, if `abort` is true, terminates the process with `code` as
   the exit status. `file` is const because __FILE__ is a string literal. */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*******************/
/* iDivUp FUNCTION */
/*******************/
/* Integer division of a by b, rounded up whenever the division leaves a remainder. */
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    const bool hasRemainder = (a % b) != 0;
    return hasRemainder ? quotient + 1 : quotient;
}

/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
/* Dot product of the 3-vectors (p1x, p1y, p1z) and (p2x, p2y, p2z).
   Compiled for both host and device. */
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
    const float xTerm = p1x * p2x;
    const float yTerm = p1y * p2y;
    const float zTerm = p1z * p2z;
    return xTerm + yTerm + zTerm;
}

/*******************/
/* KERNEL FUNCTION */
/*******************/
/* Computes all pairwise dot products between N points (p1x/p1y/p1z) and
   M points (p2x/p2y/p2z). Expects a 2D launch whose x extent covers N and
   whose y extent covers M; the thread at (x, y) writes output[y * N + x]. */
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z, 
              const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z, 
              float* __restrict__ output, const int N, const int M) {

    const int col = blockIdx.x * blockDim.x + threadIdx.x;   // index into the p1 arrays
    const int row = blockIdx.y * blockDim.y + threadIdx.y;   // index into the p2 arrays

    // Threads that fall outside the N x M grid of pairs have nothing to do.
    if (col >= N || row >= M) return;

    output[row * N + col] = scalarProduct(p1x[col], p1y[col], p1z[col], p2x[row], p2y[row], p2z[row]);
}

/********/
/* MAIN */
/********/
/* Benchmarks N x M independent dot products on the CPU and on the GPU and
   checks that both produce the same result matrix.
   Returns 0 when the results agree, 1 on allocation failure or mismatch. */
int main() {

    const int N = 10000;   // number of points in the first set
    const int M = 1000;    // number of points in the second set

    const size_t bytesA = N * sizeof(float);
    const size_t bytesB = M * sizeof(float);
    const size_t bytesC = (size_t)N * M * sizeof(float);

    // --- Host side allocations
    float *Ax = (float*)malloc(bytesA);
    float *Ay = (float*)malloc(bytesA);
    float *Az = (float*)malloc(bytesA);

    float *Bx = (float*)malloc(bytesB);
    float *By = (float*)malloc(bytesB);
    float *Bz = (float*)malloc(bytesB);

    float *C = (float*)malloc(bytesC);   // host reference result
    float *D = (float*)malloc(bytesC);   // device result copied back

    if (!Ax || !Ay || !Az || !Bx || !By || !Bz || !C || !D) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(Ax); free(Ay); free(Az);
        free(Bx); free(By); free(Bz);
        free(C);  free(D);
        return 1;
    }

    // --- Device side allocations
    float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, bytesA));
    float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, bytesA));
    float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, bytesA));

    float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, bytesB));
    float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, bytesB));
    float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, bytesB));

    float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, bytesC));

    // --- Initialization
    // Convert to float BEFORE dividing: rand() / RAND_MAX is an integer
    // division and yields 0 for practically every sample.
    srand((unsigned int)time(NULL));
    for (int i=0; i<N; i++) {
        Ax[i] = (float)rand() / (float)RAND_MAX;
        Ay[i] = (float)rand() / (float)RAND_MAX;
        Az[i] = (float)rand() / (float)RAND_MAX;
    }

    for (int i=0; i<M; i++) {
        Bx[i] = (float)rand() / (float)RAND_MAX;
        By[i] = (float)rand() / (float)RAND_MAX;
        Bz[i] = (float)rand() / (float)RAND_MAX;
    }

    // --- Host side computations
    // The reference uses the same layout as the kernel (element (i, j) at
    // j*N + i) so that the flat comparison below pairs up the same products.
    double t1 = clock();
    for (int j=0; j<M; j++)
        for (int i=0; i<N; i++)
            C[j*N + i] = scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j]);
    double t2 = clock();
    printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);

    // --- Device side computations
    dim3 dimBlock(BLOCKSIZE_X, BLOCKSIZE_Y);
    dim3 dimGrid(iDivUp(N, BLOCKSIZE_X), iDivUp(M, BLOCKSIZE_Y));

    float elapsedMs = 0.f;
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start, 0));

    // --- Host to device memory transfers
    gpuErrchk(cudaMemcpy(d_Ax, Ax, bytesA, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Ay, Ay, bytesA, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Az, Az, bytesA, cudaMemcpyHostToDevice));

    gpuErrchk(cudaMemcpy(d_Bx, Bx, bytesB, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_By, By, bytesB, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Bz, Bz, bytesB, cudaMemcpyHostToDevice));

    // --- Computations
    kernel<<<dimGrid, dimBlock>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // --- Device to host transfer of the full N x M result
    gpuErrchk(cudaMemcpy(D, d_C, bytesC, cudaMemcpyDeviceToHost));

    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsedMs, start, stop));
    printf("Elapsed time:  %3.4f ms \n", elapsedMs);

    // --- Validation
    // The device may fuse multiply-adds, so host and device results can differ
    // in the last bits; compare with a tolerance instead of exact equality.
    const float tolerance = 1e-5f;
    int status = 0;
    for (int i=0; i<N*M; i++) {
        if (fabsf(D[i] - C[i]) > tolerance) {
            printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
            status = 1;
            break;
        }
    }
    if (status == 0) printf("Results match!\n");

    // --- Cleanup
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));

    gpuErrchk(cudaFree(d_Ax)); gpuErrchk(cudaFree(d_Ay)); gpuErrchk(cudaFree(d_Az));
    gpuErrchk(cudaFree(d_Bx)); gpuErrchk(cudaFree(d_By)); gpuErrchk(cudaFree(d_Bz));
    gpuErrchk(cudaFree(d_C));

    free(Ax); free(Ay); free(Az);
    free(Bx); free(By); free(Bz);
    free(C);  free(D);

    cudaDeviceReset();
    return status;
}

第2部分

为了解决您的具体问题，即使考虑到D2H内存事务（非常便宜），CUDA也是值得的。下面的代码证实了这一点，在与上述相同的系统上进行了测试，其时间如下：

CPU: 46ms;     GPU (with D2H transaction): 0.31ms;

#include <stdio.h>
#include <time.h>

// Block-size constants; main uses BLOCKSIZE_X when configuring the kernel launch.
#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16

/********************/
/* CUDA ERROR CHECK */
/********************/
/* Checks the status of a CUDA runtime call at the call site. Written as
   do/while(0) so that the macro expands to exactly one statement and can be
   used safely inside unbraced if/else bodies. */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/* When `code` is not cudaSuccess, prints the CUDA error string together with
   the file and line to stderr and, if `abort` is true, exits with `code`.
   `file` is const because it receives the __FILE__ string literal. */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*******************/
/* iDivUp FUNCTION */
/*******************/
/* Divides a by b and bumps the quotient by one when a is not a multiple of b. */
int iDivUp(int a, int b)
{
    int blocks = a / b;
    if ((a % b) != 0) {
        blocks += 1;
    }
    return blocks;
}

/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
/* Returns p1 . p2 for two 3-vectors given component-wise.
   Usable from host and device code. */
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
    float acc = p1x * p2x;
    acc += p1y * p2y;
    acc += p1z * p2z;
    return acc;
}

/*******************/
/* KERNEL FUNCTION */
/*******************/
/* A pair (i, j) scores when |p1[i] . p2[j]| < SCORE_EPSILON. */
#define SCORE_EPSILON 0.01f

/* For every point i of the first set (p1x/p1y/p1z, N entries), counts how many
   points j of the second set (p2x/p2y/p2z, M entries) satisfy
   |p1[i] . p2[j]| < SCORE_EPSILON and stores that count in output[i].

   Launch layout: intended for a 1D launch along x. The grid-stride loop makes
   any x grid size valid. If launched with a y extent, only the y == 0 slice
   does work, so counts are never computed twice.
   No shared memory is used. */
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z, 
              const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z, 
              float* __restrict__ output, const int N, const int M) {

    if (threadIdx.y + blockIdx.y * blockDim.y != 0) return;

    const int stride = gridDim.x * blockDim.x;

    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < N; idx += stride) {

        // Keep this thread's point in registers for the whole inner loop.
        const float ax = p1x[idx];
        const float ay = p1y[idx];
        const float az = p1z[idx];

        int count = 0;
        for (int j = 0; j < M; j++)
            if (fabsf(scalarProduct(ax, ay, az, p2x[j], p2y[j], p2z[j])) < SCORE_EPSILON)
                count++;

        output[idx] = (float)count;
    }
}

/********/
/* MAIN */
/********/
/* Computes, for each of N points, the number of the M other points whose dot
   product with it is below SCORE_EPSILON in magnitude, on the CPU and on the
   GPU, and checks that the two score vectors agree.
   Returns 0 when they agree, 1 on allocation failure or mismatch. */
int main() {

    const int N = 10000;   // points that receive a score
    const int M = 1000;    // points they are tested against

    const size_t bytesA = N * sizeof(float);
    const size_t bytesB = M * sizeof(float);

    // --- Host side allocations
    float *Ax = (float*)malloc(bytesA);
    float *Ay = (float*)malloc(bytesA);
    float *Az = (float*)malloc(bytesA);

    float *Bx = (float*)malloc(bytesB);
    float *By = (float*)malloc(bytesB);
    float *Bz = (float*)malloc(bytesB);

    float *C = (float*)malloc(bytesA);   // host reference scores
    float *D = (float*)malloc(bytesA);   // device scores copied back

    if (!Ax || !Ay || !Az || !Bx || !By || !Bz || !C || !D) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(Ax); free(Ay); free(Az);
        free(Bx); free(By); free(Bz);
        free(C);  free(D);
        return 1;
    }

    // --- Device side allocations
    float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, bytesA));
    float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, bytesA));
    float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, bytesA));

    float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, bytesB));
    float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, bytesB));
    float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, bytesB));

    float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, bytesA));

    // --- Initialization
    // Convert to float BEFORE dividing: rand() / RAND_MAX is an integer
    // division and yields 0 for practically every sample.
    srand((unsigned int)time(NULL));
    for (int i=0; i<N; i++) {
        Ax[i] = (float)rand() / (float)RAND_MAX;
        Ay[i] = (float)rand() / (float)RAND_MAX;
        Az[i] = (float)rand() / (float)RAND_MAX;
    }

    for (int i=0; i<M; i++) {
        Bx[i] = (float)rand() / (float)RAND_MAX;
        By[i] = (float)rand() / (float)RAND_MAX;
        Bz[i] = (float)rand() / (float)RAND_MAX;
    }

    // --- Host side computations
    // Accumulate a count per point; assigning 0/1 inside the inner loop would
    // leave only the result of the last j in C[i].
    double t1 = clock();
    for (int i=0; i<N; i++) {
        int count = 0;
        for (int j=0; j<M; j++)
            if (fabsf(scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j])) < SCORE_EPSILON)
                count++;
        C[i] = (float)count;
    }
    double t2 = clock();
    printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);

    // --- Device side computations
    float elapsedMs = 0.f;
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start, 0));

    // --- Host to device memory transfers
    gpuErrchk(cudaMemcpy(d_Ax, Ax, bytesA, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Ay, Ay, bytesA, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Az, Az, bytesA, cudaMemcpyHostToDevice));

    gpuErrchk(cudaMemcpy(d_Bx, Bx, bytesB, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_By, By, bytesB, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Bz, Bz, bytesB, cudaMemcpyHostToDevice));

    // --- Computations
    // 1D launch, one thread per scored point, 256 threads (8 warps) per block.
    const int threadsPerBlock = BLOCKSIZE_X * BLOCKSIZE_Y;
    kernel<<<iDivUp(N, threadsPerBlock), threadsPerBlock>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // --- Device to host transfer: only N scores, not the N x M products
    gpuErrchk(cudaMemcpy(D, d_C, bytesA, cudaMemcpyDeviceToHost));

    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&elapsedMs, start, stop));
    printf("Elapsed time:  %3.4f ms \n", elapsedMs);

    // --- Validation
    // The device may fuse multiply-adds, so a product lying within rounding
    // error of the threshold can be classified differently on host and device.
    // Accept a difference of at most one count per point.
    int status = 0;
    for (int i=0; i<N; i++) {
        if (fabsf(D[i] - C[i]) > 1.f) {
            printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
            status = 1;
            break;
        }
    }
    if (status == 0) printf("Results match!\n");

    // --- Cleanup
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));

    gpuErrchk(cudaFree(d_Ax)); gpuErrchk(cudaFree(d_Ay)); gpuErrchk(cudaFree(d_Az));
    gpuErrchk(cudaFree(d_Bx)); gpuErrchk(cudaFree(d_By)); gpuErrchk(cudaFree(d_Bz));
    gpuErrchk(cudaFree(d_C));

    free(Ax); free(Ay); free(Az);
    free(Bx); free(By); free(Bz);
    free(C);  free(D);

    cudaDeviceReset();
    return status;
}

Answer 2

不应优化算术，而应首先使用更好的算法。在大多数实际情况中，每个周期ts和A并不是完全随机的，你可以在某种程度上在空间上组织（排序）它们，并大大减少计算空间度量的需要。

现在，如果你坚持使用当前的算法，你可以让编译器发出SSE代码，这样可以在没有任何编程工作的情况下立即提升。

既然你必须提出这个问题，那么你可以通过使用编译器内在函数手动编码来进一步挤压周期的可能性相对较小。

关于CUDA：仅仅1000万个点积，CPU内存—显存—GPU之间通信的开销非常大，不值得一试。

Answer 3

要使用MIMD和OpenMP并行化，可以执行以下操作：

// Distribute the iterations of the outer loop over the OpenMP threads.
// Iteration i only ever writes score[i], and j is declared inside the loop body.
#pragma omp parallel for
for(int i = 0; i< ts.size(); i++){     
    for(int j = 0; j< A.size(); j++){
        // Count the pair when the magnitude of the scalar product is below epsilon.
        if(abs(scalarProduct(ts.at(i), A.at(j))) <epsilon){
            score[i] +=1;
        }
    }
}

您也可以考虑使用SIMD。在这种情况下，您应该更改数据结构，存储大小等于SIMD宽度的点块（对于使用单精度浮点数的SSE，宽度为4），例如：

// Block of four 3D points stored component-wise: x[k], y[k], z[k] are the
// coordinates of point k. Four floats per component matches the SSE width
// the surrounding text refers to.
class PointBlock4 {
public:
    float x[4];
    float y[4];
    float z[4];
    // further members omitted in this sketch
};

每个块包含四个点。这显然更复杂，但可以实现，并且同样可以获得约4倍的加速。结合SIMD和MIMD，您可以获得16倍的加速（四核时）。但是当n很大时，您的算法将受内存带宽限制而不是计算限制，因此加速比会低得多。事实上，您的算法可能已经受内存限制，因此SIMD或MIMD可能带不来多少提升。我会先测试OpenMP，看看是否有明显收益。

快速计算许多标量积

3 个答案: