我正在 GTX 680 上做一些 CUDA 性能测试,想请大家帮我理解为什么会得到下面的性能结果。我运行的代码如下:
#include <stdio.h>
using namespace std;
// Benchmark kernel whose loop trip count is the literal 10000, so it is known
// at compile time. Every pass hits a block-wide barrier and adds the loop
// index to a per-thread accumulator.
//
// rec    - value written by the guarded stores below; it does not influence
//          the loop.
// output - target of the guarded stores. It is dereferenced only by a thread
//          whose threadIdx.x equals 1000, so blocks narrower than that never
//          touch it.
__global__ void test_hardcoded(int rec,int * output)
{
    // Start from zero: the accumulator used to be read uninitialized, which is
    // undefined behaviour and lets the compiler treat the sum as arbitrary.
    int a = 0;
    int rec2 = rec / 2;

    if (threadIdx.x == 1000) *output = rec;
    if (threadIdx.x == 1000) *(output + 1) = rec2;

    for (int i = 0; i < 10000; i++)
    {
        // Reached by every thread of the block on every pass (no divergence).
        __syncthreads();
        a += i;
    }

    // Observable use of the accumulator, intended to keep the compiler from
    // discarding the loop above.
    if (threadIdx.x == 1000) *output = a;
}
// Benchmark kernel whose loop bound is computed at run time from the kernel
// argument (rec / 2). Every pass hits a block-wide barrier and adds the loop
// index to a per-thread accumulator.
//
// rec    - twice the loop bound; also written by the guarded stores below.
// output - target of the guarded stores. It is dereferenced only by a thread
//          whose threadIdx.x equals 1000, so blocks narrower than that never
//          touch it.
__global__ void test_softcoded(int rec,int * output)
{
    // Start from zero: the accumulator used to be read uninitialized, which is
    // undefined behaviour and lets the compiler treat the sum as arbitrary.
    int a = 0;
    // Derived from the argument so the bound is not a compile-time constant
    // (the author's stated intent was for it to be held in a register).
    int rec2 = rec / 2;

    if (threadIdx.x == 1000) *output = rec;
    if (threadIdx.x == 1000) *(output + 1) = rec2;

    // The bound is inclusive, so the body runs rec2 + 1 times.
    for (int i = 0; i <= rec2; i++)
    {
        // Reached by every thread of the block on every pass (no divergence).
        __syncthreads();
        a += i;
    }

    // Observable use of the accumulator, intended to keep the compiler from
    // discarding the loop above.
    if (threadIdx.x == 1000) *output = a;
}
// Reports a failed CUDA runtime call on stderr and makes the enclosing
// function return 1. Only usable inside a function that returns int.
#define CUDA_CHECK_OR_RETURN(call)                                          \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Times one launch of test_hardcoded and one launch of test_softcoded with
// CUDA events and prints the elapsed milliseconds for each.
// Returns 0 on success and 1 if any CUDA call or kernel launch fails.
int main(int argc, char *argv[])
{
    float timestamp = 0.0f;
    cudaEvent_t event_start, event_stop;

    // Initialise
    CUDA_CHECK_OR_RETURN(cudaSetDevice(0));
    CUDA_CHECK_OR_RETURN(cudaEventCreate(&event_start));
    CUDA_CHECK_OR_RETURN(cudaEventCreate(&event_stop));

    // 32 x 32 = 1024 threads per block, 1000 blocks along y.
    dim3 threadsPerBlock(32, 32, 1);
    dim3 blocks(1, 1000, 1);

    // The output pointer is NULL. That is safe for this configuration only:
    // the kernels store through it when threadIdx.x == 1000, and threadIdx.x
    // never exceeds 31 here.
    CUDA_CHECK_OR_RETURN(cudaEventRecord(event_start));
    test_hardcoded<<<blocks,threadsPerBlock,0>>>(10000,NULL);
    CUDA_CHECK_OR_RETURN(cudaGetLastError());   // launch-configuration errors
    CUDA_CHECK_OR_RETURN(cudaEventRecord(event_stop, 0));
    CUDA_CHECK_OR_RETURN(cudaEventSynchronize(event_stop));
    CUDA_CHECK_OR_RETURN(cudaEventElapsedTime(&timestamp, event_start, event_stop));
    printf("test_hardcoded() took %fms \n", timestamp);

    // 20000 / 2 gives the soft-coded kernel a loop bound of 10000.
    CUDA_CHECK_OR_RETURN(cudaEventRecord(event_start));
    test_softcoded<<<blocks,threadsPerBlock,0>>>(20000,NULL);
    CUDA_CHECK_OR_RETURN(cudaGetLastError());   // launch-configuration errors
    CUDA_CHECK_OR_RETURN(cudaEventRecord(event_stop, 0));
    CUDA_CHECK_OR_RETURN(cudaEventSynchronize(event_stop));
    CUDA_CHECK_OR_RETURN(cudaEventElapsedTime(&timestamp, event_start, event_stop));
    printf("test_softcoded() took %fms \n", timestamp);

    CUDA_CHECK_OR_RETURN(cudaEventDestroy(event_start));
    CUDA_CHECK_OR_RETURN(cudaEventDestroy(event_stop));
    return 0;
}
如代码所示,我运行了两个内核,它们所做的只是循环和累加。唯一的区别是:test_softcoded() 的循环上界保存在寄存器中,而 test_hardcoded() 的循环上界是直接硬编码的整数常量。
当我运行上面的代码时,我得到以下结果
$ nvcc -arch=sm_30 test7.cu
$ ./a.out
test_hardcoded() took 51.353985ms
test_softcoded() took 99.209694ms
test_hardcoded() 的速度大约是 test_softcoded() 的两倍!
据我所知,test_softcoded() 中存在潜在的寄存器写后读(read-after-write)依赖,但我的理解是,在占用率很高的情况下(这里的占用率应该非常高),寄存器延迟会被完全隐藏。所以我想知道问题可能出在哪里,以及如何提高 test_softcoded() 的性能。
答案 0 :(得分:1)
由于这个硬编码值,编译器可以进行一些优化,例如循环展开,这可能会使性能提高一些。这可能是原因。
您可以给 test_softcoded 中的 for 循环加上循环展开来验证这一点:在 `for (int i=0;i<=rec2;i++)` 之前添加类似 `#pragma unroll 5000` 的指令,然后重新运行,看结果是否能解答您的疑问。