Question

我正在尝试并行化一个函数，该函数将三个数组（x，y和prb）和一个标量作为输入，并输出三个数组（P1，Pt1和Px）。

原始c代码在这里（异常值和E无关紧要）：

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define max(A, B)   ((A) > (B) ? (A) : (B))
#define min(A, B)   ((A) < (B) ? (A) : (B))

void cpd_comp(
        double* x,
        double* y, 
        double* prb,
        double* sigma2,
        double* outlier,
        double* P1,
        double* Pt1,
        double* Px,
        double* E,
        int N,
        int M,
        int D
        )

{
  int       n, m, d;
  double    ksig, diff, razn, outlier_tmp, sp;
  double    *P, *temp_x;

  P = (double*) calloc(M, sizeof(double));
  temp_x = (double*) calloc(D, sizeof(double));

  ksig = -2.0 * *sigma2;


  for (n=0; n < N; n++) {

      sp=0;
      for (m=0; m < M; m++) {
          razn=0;
          for (d=0; d < D; d++) {
             diff=*(x+n+d*N)-*(y+m+d*M);  diff=diff*diff;
             razn+=diff;
          }

          *(P+m)=exp(razn/ksig) ;
          sp+=*(P+m);
      }


      *(Pt1+n)=*(prb+n);
      for (d=0; d < D; d++) {
       *(temp_x+d)=*(x+n+d*N)/ sp;
      }

      for (m=0; m < M; m++) {
          *(P1+m)+=((*(P+m)/ sp) **(prb+n));

          for (d=0; d < D; d++) {
          *(Px+m+d*M)+= (*(temp_x+d)**(P+m)**(prb+n));
          }

      }

   *E +=  -log(sp);     
  }
  *E +=D*N*log(*sigma2)/2;


  free((void*)P);
  free((void*)temp_x);

  return;
}

以下是我尝试并行化的方法：

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>

/*headers*/
void cpd_comp(
    float * x,        //Points to register      [N*D]
    float * y,        //Points to be registered [M*D]
    float * prb,      //Vector of probabilities [N]
    float * sigma2,   //Square of sigma
    float ** P1,       //P1,  output, [M]
    float ** Pt1,      //Pt1, output, [N]
    float ** Px,       //Px,  output, [M*3]
    int N,            //Number of points, i.e. rows, in x
    int M             //Number of points, i.e. rows, in 
    );

__global__ void d_computeP(
    float * P,
    float * P1,
    float * Px,
    float * ProbabilityMatrix,
    float * x,
    float * y,
    float * prb,
    float ksig,
    const int N,
    const int M);

__global__ void d_sumP(
    float * sp,
    float * P1timessp,
    float * Pxtimessp,
    float * P1,
    float * Px,
    const int N,
    const int M);

/*implementations*/

void cpd_comp(
    float * x,        //Points to register      [N*D]
    float * y,        //Points to be registered [M*D]
    float * prb,      //Vector of probabilities [N]
    float * sigma2,   //Scalar
    float ** P1,       //P1,  output, [M]
    float ** Pt1,      //Pt1, output, [N]
    float ** Px,       //Px,  output, [M*3]
    int N,            //Number of points, i.e. rows, in x
    int M             //Number of points, i.e. rows, in y
    ){
    //X is generatedPointPos
    //Y is points

    float
        *P,
        *P1timessp,
        *Pxtimessp,
        ksig = -2.0 * (*sigma2),
        *h_sumofP = new float[N], //sum of P, on host
        *d_sumofP;                //sum of P, on device

    cudaMalloc((void**)&P,        sizeof(float)*M*N);
    cudaMalloc((void**)&P1timessp,sizeof(float)*M*N);
    cudaMalloc((void**)&Pxtimessp,sizeof(float)*M*N*3);
    cudaMalloc((void**)&d_sumofP, sizeof(float)*N);

    cudaMalloc((void**)P1,        sizeof(float)*M);
    cudaMalloc((void**)Px,        sizeof(float)*M*3);
    cudaMalloc((void**)Pt1,       sizeof(float)*N);

    d_computeP<<<dim3(N,M/1024+1),M>1024?1024:M>>>(P,P1timessp,Pxtimessp,NULL,x,y,prb,ksig,N,M);

    for(int n=0; n<N; n++){
        thrust::device_ptr<float>dev_ptr(P);
        h_sumofP[n] = thrust::reduce(dev_ptr+M*n,dev_ptr+M*(n+1),0.0f,thrust::plus<float>());
    }

    cudaMemcpy(d_sumofP,h_sumofP,sizeof(float)*N,cudaMemcpyHostToDevice);

    d_sumP<<<M/1024+1,M>1024?1024:M>>>(d_sumofP,P1timessp,Pxtimessp,*P1,*Px,N,M);

    cudaMemcpy(*Pt1,prb,sizeof(float)*N,cudaMemcpyDeviceToDevice);

    cudaFree(P);
    cudaFree(P1timessp);
    cudaFree(Pxtimessp);
    cudaFree(d_sumofP);
    delete[]h_sumofP;
}

/*kernels*/

__global__ void d_computeP(
    float * P,
    float * P1,
    float * Px,
    float * ProbabilityMatrix,
    float * x,
    float * y,
    float * prb,
    float ksig,
    const int N,
    const int M){
    //thread configuration: <<<dim3(N,M/1024+1),1024>>>
    int m = threadIdx.x+blockIdx.y*blockDim.x;
    int n = blockIdx.x;
    if(m>=M || n>=N) return;

    float 
        x1 = x[3*n],
        x2 = x[3*n+1],
        x3 = x[3*n+2],
        diff1 = x1 - y[3*m],
        diff2 = x2 - y[3*m+1],
        diff3 = x3 - y[3*m+2],
        razn = diff1*diff1+diff2*diff2+diff3*diff3,

        Pm = __expf(razn/ksig), //fast exponentiation
        prbn = prb[n];

    P[M*n+m] = Pm; 

    __syncthreads();

    P1[N*m+n] = Pm*prbn;
    Px[3*(N*m+n)+0] = x1*Pm*prbn;
    Px[3*(N*m+n)+1] = x2*Pm*prbn;
    Px[3*(N*m+n)+2] = x3*Pm*prbn;
}

__global__ void d_sumP(
    float * sp,
    float * P1timessp,
    float * Pxtimessp,
    float * P1,
    float * Px,
    const int N,
    const int M){
    //computes P1 and Px
    //thread configuration: <<<M/1024+1,1024>>>
    int m = threadIdx.x+blockIdx.x*blockDim.x;
    if(m>=M) return;
    float 
        P1m = 0,
        Pxm1 = 0,
        Pxm2 = 0,
        Pxm3 = 0;
    for(int n=0; n<N; n++){
        float spn = 1/sp[n];
        P1m += P1timessp[N*m+n]*spn;
        Pxm1 += Pxtimessp[3*(N*m+n)+0]*spn;
        Pxm2 += Pxtimessp[3*(N*m+n)+1]*spn;
        Pxm3 += Pxtimessp[3*(N*m+n)+2]*spn;
    }

    P1[m] = P1m;
    Px[3*m+0] = Pxm1;
    Px[3*m+1] = Pxm2;
    Px[3*m+2] = Pxm3;

}

然而，令我惊讶的是，它运行得比原始版本慢得多。如何让它运行得更快？请仔细解释一下，因为我对CUDA和并行编程都很陌生并且没有算法经验。

请注意，c版本具有列主要排序，而CUDA版本具有行主要版本。我做了几次测试，以确保结果是正确的。它只是非常慢，占用了大量的内存。

非常感谢任何帮助！

编辑：更多信息：N和M大约是几千（例如，300-3000），D总是3.CUDA版本需要数组 device 内存，除了对于以h _。为前缀的变量。

Answer 1

在尝试任何特定于CUDA的优化之前，请对代码进行概要分析，以查看花费的时间。

尝试并排列数组读/写，以便每个CUDA线程使用跨步访问模式。例如，目前你有

int m = threadIdx.x+blockIdx.y*blockDim.x;
int n = blockIdx.x;
if(m>=M || n>=N) return;

diff1 = x1 - y[3*m],
diff2 = x2 - y[3*m+1],
diff3 = x3 - y[3*m+2],

因此，线程1将从y[0],y[1],y[2]等读取。相反，重新排列数据，以便线程1从y[0],y[M],y[2*M]读取，线程2从y[1],y[M+1],y[2*M+1]等读取。您应该遵循此访问模式对于其他阵列。

此外，您可能需要考虑是否可以避免使用__syncthreads()。我不太理解为什么在这个算法中有必要，可能值得删除它以查看它是否能提高性能（即使它产生不正确的结果）。

Answer 2

良好的CUDA性能的关键几乎总是尽可能接近最佳内存访问。您的内存访问模式看起来与矩阵乘法非常相似。我将从一个很好的CUDA矩阵乘法实现开始，确保理解为什么它的实现方式，然后根据您的需要进行修改。

如何以有效的方式并行化这个三重循环？

2 个答案: