CUDA: programming with twice as many blocks (tiling?)

Date: 2015-09-23 14:07:20

Tags: c++ cuda

My 3D Laplacian solver works and reaches 350 Gflop/s. I am trying to upgrade it to get better performance by using twice as many blocks. However, the performance is still 350 Gflop/s:

 #include <iostream>
 #include <sys/time.h>
 #include <cuda.h>
 #include <ctime>
 #include"res3dcb.cuh"
 #include <math.h>
 using namespace std;

 // Constant declarations.
 const int blocksize=32;
 const int N=128;
 const int size=(N+2)*(N+2)*(N+2)*sizeof(float);

 // Let's start the main program.
 int main(void) {

 // Variable declarations.
 float time1,time2,time3;
 float *x_d, *y_d; 
 float *x,*y; 
 float gflops;
 float NumOps;
 int power=4; // You can change power as you prefer (but keep 2^x)

 // Init x and y. 
 x = new float[size];
 y = new float[size];

 for (int k=1;k<N+1;k++)
    for (int i=1;i<N+1;i++) 
        for (int j=1;j<N+1;j++) { 
            x[k*(N+2)*(N+2)+i*(N+2)+j]=cos(i+j+k);
        }

 // Shadow cases.
 for (int k=1;k<N+1;k++) {
    for (int i=1;i<N+1;i++) { 
      x[k*(N+2)*(N+2)+i*(N+2)]=x[k*(N+2)*(N+2)+i*(N+2)+1]; 
      x[k*(N+2)*(N+2)+i*(N+2)+N+1]=x[k*(N+2)*(N+2)+i*(N+2)+N];}

    for (int j=0;j<N+2;j++) { 
      x[k*(N+2)*(N+2)+j]=x[k*(N+2)*(N+2)+(N+2)+j]; 
      x[k*(N+2)*(N+2)+(N+1)*(N+2)+j]=x[k*(N+2)*(N+2)+N*(N+2)+j];}

 for (int i=0;i<N+2;i++) 
    for (int j=0;j<N+2;j++) {
        x[(N+2)*i+j]=x[(N+2)*(N+2)+(N+2)*i+j];
        x[(N+1)*(N+2)*(N+2)+(N+2)*i+j]=x[(N+2)*(N+2)*N+(N+2)*i+j];
    }

 // Display of initial matrix.
 int id_stage=-2;
 while (id_stage!=-1) {
     cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
     cin>>id_stage;
     cout<<endl;

     if (id_stage != -1) {
    cout<<"Etage "<<id_stage<<" du cube :"<<endl;
    for (int i=0;i<N+2;i++) {
        cout<<"| ";
        for (int j=0;j<N+2;j++) {cout<<x[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
        cout<<"|"<<endl;
        }
         cout<<endl;
     }
 }

 // CPU to GPU.
 cudaMalloc( (void**) & x_d, size);
 cudaMalloc( (void**) & y_d, size);

 cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice) ;
 cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice) ;

 // Solver parameters.
 dim3 dimGrid(power*N/blocksize, power*N/blocksize);
 dim3 dimBlock(blocksize, blocksize);

 // Solver loop.
 time1=clock();

 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power); 

 time2=clock();
 time3=(time2-time1)/CLOCKS_PER_SEC;

 // Power calculation.
 NumOps=(1.0e-9)*N*N*N*7;
 gflops = ( NumOps / (time3));

 // GPU to CPU.
 cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
 cudaFree(x_d);
 cudaFree(y_d);

 // Display of final matrix.
 id_stage=-2;
 while (id_stage!=-1) {
    cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
    cin>>id_stage;
    cout<<endl;

     if (id_stage != -1) {
        cout<<"Etage "<<id_stage<<" du cube :"<<endl;
        for (int i=0;i<N+2;i++) {
            cout<<"| ";
            for (int j=0;j<N+2;j++) {cout<<y[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
            cout<<"|"<<endl;
         }
        cout<<endl;
     }
 }

 cout<<"Time : "<<time3<<endl;
 cout<<"Gflops/s : "<<gflops<<endl;

 }

where:

 __global__ void res2d(volatile float* x, float* y, int N, int power) 
{
    int i = threadIdx.x + blockIdx.x*(blockDim.x);
    int j = threadIdx.y + blockIdx.y*(blockDim.y);
    int id,jd;

    #pragma unroll // Now let's reduce the number of operations per block
    for (int incr=1; incr<power+1; incr++) {
        if (i>(incr-1)*N && i<incr*N && j>(incr-1)*N && j<incr*N) {
            #pragma unroll
            for (int k=(incr-1)*(N/power) ; k<incr*N/power ; k++) {
                id=i-(incr-1)*N;
                jd=j-(incr-1)*N;
                y[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1] = x[(N+2)*(N+2)*(k+1)+(N+2)*(id+2)+jd+1] 
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*id+jd+1] 
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+2] 
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd] 
                                                       + x[(N+2)*(N+2)*(k+2)+(N+2)*(id+1)+jd+1] 
                                                       + x[(N+2)*(N+2)*k+(N+2)*(id+1)+jd+1] 
                                                       - 6*x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1];
            }   
        }
    }
}

with the parameters:

dimGrid(power * N/blocksize, power * N/blocksize) & dimBlock(blocksize, blocksize)

Questions:

  1. If power = 2, 4, or 8, the number of operations per block is divided by 2, 4, or 8 respectively. But it is not faster. Why?

  2. Is it useless to reduce the number of operations per block?

  3. Thanks in advance for your help.

1 answer:

Answer 0 (score: 1)

CUDA kernel launches are asynchronous. When you do this:

 // Solver loop.
 time1=clock();

 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power); 

 time2=clock();
 time3=(time2-time1)/CLOCKS_PER_SEC;

the timer only captures the API launch latency, not the actual execution time of the code. That is why changing the amount of work done inside the kernel appears to have no effect on performance: your timing methodology is incorrect.

Do something like this instead:

 // Solver loop.
 time1=clock();

 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power); 
 cudaDeviceSynchronize();

 time2=clock();
 time3=(time2-time1)/CLOCKS_PER_SEC;

This inserts a blocking call that ensures the kernel has finished executing before the elapsed time is measured.
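
A further option, not part of the original answer: CUDA events record timestamps on the GPU itself, so they measure the kernel's execution time directly instead of relying on clock() on the host. A minimal sketch, reusing dimGrid, dimBlock, x_d, y_d, N, power and time3 from the code above:

 cudaEvent_t start, stop;
 cudaEventCreate(&start);
 cudaEventCreate(&stop);

 cudaEventRecord(start);
 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
 cudaEventRecord(stop);

 // Block until the kernel and the stop event have completed.
 cudaEventSynchronize(stop);

 float ms = 0.0f;
 cudaEventElapsedTime(&ms, start, stop); // elapsed time in milliseconds
 time3 = ms / 1000.0f;                   // seconds, comparable to the clock() result

 cudaEventDestroy(start);
 cudaEventDestroy(stop);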

[This answer has been added as a community wiki entry to get the question off the unanswered queue.]