我尝试从下面的代码中达到每个 SM 的最佳效果。峰值介于25 GFlops(GTX275-GT200 Arch。)之间。此代码最多可提供8个GFlops
__global__ void new_ker(float *x)
{
int index = threadIdx.x+blockIdx.x*blockDim.x;
float a,b;
a=0;
b=x[index];
//LOOP=10000000
//No. of blocks = 1
//Threads per block = 512 (I'm using GTX 275 - GT200 Arch.)
#pragma unroll 2048
for(int i=0;i<LOOP;i++){
a=a*b+b;
}
x[index] = a;
}
我不想在代码中增加ILP。任何想法为什么它没有达到峰值?
int main(int argc,char **argv)
{
//Initializations
float *x;
float *dx;
cudaEvent_t new_start,new_stop;
float elapsed;
double gflops;
x = 0;
flag = 0;
cudaMalloc((void **)&dx,sizeof(float)*THPB);
//ILP=1
cudaEventCreate(&new_start);
cudaEventCreate(&new_stop);
printf("Kernel1:\n");
cudaEventRecord(new_start, 0);
new_ker<<<BLOCKS,THPB>>>(dx);
cudaEventRecord(new_stop,0);
cudaEventSynchronize(new_stop);
cudaEventElapsedTime(&elapsed,new_start,new_stop);
x = (float *)malloc(sizeof(float)*THPB);
cudaMemcpy(x,dx,sizeof(float)*THPB,cudaMemcpyDeviceToHost);
gflops = ((double)(BLOCKS)*(THPB)*LOOP/elapsed)/1000000;
printf("\t%f",gflops);
cudaEventDestroy(new_start);
cudaEventDestroy(new_stop);
return 0;
}
平台: CUDA 3.0 NVIDIA GeForce GTX275(GT200)
答案 0 :(得分:4)
如果我使用正确的FLOP计算从您的代码中整理出一个完整的repro案例:
#include <stdio.h>
#define LOOP (10000000)
#define BLOCKS (30)
#define THPB (512)
__global__ void new_ker(float *x)
{
int index = threadIdx.x+blockIdx.x*blockDim.x;
float a,b;
a=0;
b=x[index];
#pragma unroll 2048
for(int i=0;i<LOOP;i++){
a=a*b+b;
}
x[index] = a;
}
int main(int argc,char **argv)
{
//Initializations
float *x;
float *dx;
cudaEvent_t new_start,new_stop;
float elapsed;
double gflops;
x = 0;
cudaMalloc((void **)&dx,sizeof(float)*THPB);
//ILP=1
cudaEventCreate(&new_start);
cudaEventCreate(&new_stop);
printf("Kernel1:\n");
cudaEventRecord(new_start, 0);
new_ker<<<BLOCKS,THPB>>>(dx);
cudaEventRecord(new_stop,0);
cudaEventSynchronize(new_stop);
cudaEventElapsedTime(&elapsed,new_start,new_stop);
x = (float *)malloc(sizeof(float)*THPB*BLOCKS);
cudaMemcpy(x,dx,sizeof(float)*THPB*BLOCKS,cudaMemcpyDeviceToHost);
gflops = 2.0e-6 * ((double)(LOOP)*double(THPB*BLOCKS)/(double)elapsed);
printf("\t%f\n",gflops);
cudaEventDestroy(new_start);
cudaEventDestroy(new_stop);
return 0;
}
我编译它并在64位Linux平台上使用CUDA 3.2在1.4GHz GTX275上运行它:
$ nvcc -arch=sm_13 -Xptxas="-v" -o perf perf.cu
ptxas info : Compiling entry function '_Z7new_kerPf' for 'sm_13'
ptxas info : Used 4 registers, 8+16 bytes smem, 8 bytes cmem[1]
$ ./perf
Kernel1:
671.806039
对于运行纯FMAD码(1.4 GHz * 2 FLOP * 8核/ MP * 30 MP)= 672 GFLOP / s的卡,我得到峰值FLOP / s的0.01%以内。
所以看起来代码实际上每个多处理器只有一个块达到峰值FLOP / s,但你只是没有正确计算FLOP / s数。