考虑以下代码,该代码在 9x K20 下运行
// Question's FIRST variant (measured ~9us). Launched <<<256, 256>>> per the
// update below, so tx ranges over 0..255 and each block handles 256 pixels.
__global__ void histogram( unsigned char *inputPointer, int *outputPointer)
{
// Per-thread "dispersed" histogram: row tx holds 32 bytes = 256 bit-flags.
__shared__ unsigned char localDispersedHistogram[ 256 ] [ 32 ];
__shared__ unsigned int partHist[ 256 ] ;
int i ;  // unused in this variant
int tx = threadIdx.x;
// Flat global index of the one input pixel this thread reads.
int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x;
uint8_t val = inputPointer[ pixelOffset ];
// Split the pixel value into a byte index (val/8) and a bit position (val%8).
uint8_t data = val/ 8 ;
uint8_t position = val % 8 ;
/**Trying to avoid loops thats why this code */
// NOTE(review): this clears only byte (tx % 32) of row tx; the other 31
// bytes of the row are never zeroed here (the full code below zeroes all 32
// in a loop).
localDispersedHistogram [ tx ] [ tx % 32 ] = 0 ;
__syncthreads();
// TURN_ON(DATA,POSITION) is (DATA |= 1 << POSITION) — set the bit for val.
TURN_ON( localDispersedHistogram [ tx ] [ data ] , position );
__syncthreads();
partHist[ tx ] = 0;
// Dead loop: k is never read afterwards, so the compiler is free to delete
// it entirely — this is the crux of the accepted answer below.
int k = 0 ;
for ( int i = 0 ; i < 256 ; i ++ ) {
k++;
}
}
现在,以下代码在访问共享变量时采用 72us :
// Question's SECOND variant (measured ~72us). Identical to the first except
// the final loop now increments a __shared__ location instead of a dead
// local, so its side effect cannot be discarded as trivially.
__global__ void histogram( unsigned char *inputPointer, int *outputPointer)
{
// 256 rows (one per thread) x 32 bytes of bit flags.
__shared__ unsigned char localDispersedHistogram[ 256 ] [ 32 ];
__shared__ unsigned int partHist[ 256 ] ;
int i ;  // unused in this variant
int tx = threadIdx.x;
// Flat global index of this thread's input pixel.
int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x;
uint8_t val = inputPointer[ pixelOffset ];
// Byte index and bit position of the pixel value within a 32-byte row.
uint8_t data = val/ 8 ;
uint8_t position = val % 8 ;
/**Trying to avoid loops thats why this code */
// NOTE(review): only byte (tx % 32) of row tx is zeroed — see the full code
// below, which zeroes all 32 bytes per row instead.
localDispersedHistogram [ tx ] [ tx % 32 ] = 0 ;
__syncthreads();
// TURN_ON(DATA,POSITION) is (DATA |= 1 << POSITION) — set the bit for val.
TURN_ON( localDispersedHistogram [ tx ] [ data ] , position );
__syncthreads();
partHist[ tx ] = 0;
// 256 read-modify-write operations on shared memory per thread — the loop
// the question is asking about.
for ( int i = 0 ; i < 256 ; i ++ ) {
partHist[ tx ]++;
}
}
为什么共享访问会产生如此巨大的差异? 我理解共享访问比注册访问昂贵,但如果你查看上面的代码,行
TURN_ON( localDispersedHistogram [ tx ] [ data ] , position );
也在使用共享变量,为什么localDispersedHistogram的操作花费的时间更少,只有partHist访问需要疯狂的时间?
帮助。
更新: 道歉:
我的内核配置是 <<<256, 256>>>
完整代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <conio.h>
#define SIZE_OF_OUTPUT_ARRAY 256 * 256 * 256
#define SIZE_OF_INPUT_ARRAY 256 * 256
#define TURN_ON(DATA,POSITION) (DATA|=(1<<(POSITION)))
// Full posted kernel. Launched <<<256, 256>>>: each block builds a 256-bin
// partial histogram of its 256 pixels in shared memory, then accumulates it
// into the global histogram with one atomicAdd per bin.
__global__ void histogram( unsigned char *inputPointer, int *outputPointer)
{
#if 1
// Row tx is a 256-bit flag set (32 bytes) recording thread tx's pixel value.
__shared__ unsigned char localDispersedHistogram[ 256 ] [ 32 ];
__shared__ long long partHist[ 256 ] ;
int i ;  // unused (the loops declare their own i/j)
int tx = threadIdx.x;
// Flat global index of the one pixel this thread reads.
int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x;
uint8_t val = inputPointer[ pixelOffset ];
// Byte index (val/8) and bit position (val%8) inside the 32-byte row.
uint8_t data = val/ 8 ;
uint8_t position = val % 8 ;
// Zero the entire 32-byte row for this thread (unlike the snippets above,
// which cleared only one byte).
for ( int j = 0 ; j < 32 ; j++) {
localDispersedHistogram[ tx ] [ j ] = 0;
}
__syncthreads();
// TURN_ON(DATA,POSITION) is (DATA |= 1 << POSITION): mark this thread's
// pixel value as seen. Each thread writes only its own row, so no race.
TURN_ON( localDispersedHistogram [ tx ] [ data ] , position );
__syncthreads();
//partHist[ tx ] = 0;
// Thread tx counts how many of the block's 256 threads saw value tx:
// test bit (tx % 8) of byte (tx / 8) in every row i, shifted down to 0/1.
int sum = 0 ;
for ( int i = 0 ; i < 256 ; i ++ ) {
sum += (localDispersedHistogram [ i ] [ tx/ 8 ] & ( 1 << (tx % 8 ))) >> (tx % 8 ) ;
}
partHist[ tx ] = sum;
// NOTE(review): outputPointer is int* while partHist is long long — the
// value is implicitly converted for atomicAdd(int*, int); fine while the
// per-block count fits in int (max 256 here).
atomicAdd( &outputPointer[ tx ] , partHist[ tx ] );
#endif
}
// Host driver: builds a random 64K-byte input, runs the GPU histogram,
// recomputes the histogram on the CPU, and compares the two bin-by-bin.
int main()
{
#if 1
printf(" Code Name, Sad buddy 17 ");
// Fill the input with uniformly random byte values 0..255.
unsigned char *inputPointer = (unsigned char * ) malloc (SIZE_OF_INPUT_ARRAY);
for ( int i = 0 ; i < SIZE_OF_INPUT_ARRAY ; i ++ ) {
int t = rand() % 256 ;
//int t = 0;
inputPointer [ i ] = t;
}
unsigned char *device_inputPointer;
int *device_outputPointer;
// NOTE(review): none of the CUDA API calls below are error-checked; a
// failed malloc/copy/launch would go unnoticed until the final compare.
cudaMalloc((void**)&device_inputPointer, SIZE_OF_INPUT_ARRAY);
cudaMemcpy( device_inputPointer, inputPointer , SIZE_OF_INPUT_ARRAY, cudaMemcpyHostToDevice );
// 256-bin output histogram, zero-initialized on the device.
cudaMalloc((void**)&device_outputPointer, 256 * sizeof ( int ) );
cudaMemset(device_outputPointer,0,256 * sizeof ( int ) );
// 256 blocks x 256 threads = one thread per input byte.
histogram <<< 256 , 256 >>> ( device_inputPointer , device_outputPointer );
// The blocking cudaMemcpy below also synchronizes with the kernel.
unsigned int *output = ( unsigned int * )malloc ( 256 * sizeof( int ));
cudaMemcpy( output, device_outputPointer , 256 * sizeof( int ), cudaMemcpyDeviceToHost );
unsigned int CPUHist [ 256 ] ;
unsigned int GPUHist [ 256 ] ;  // note: written to zero below but otherwise unused
for ( int i = 0 ; i < 256 ;i ++ ) {
CPUHist[ i ] = 0;
GPUHist [ i ] = 0;
//printf( " %d " , inputPointer[ i ]);
}
// CPU reference histogram over the same input.
for ( int i = 0 ; i < SIZE_OF_INPUT_ARRAY ; i++ ) {
CPUHist[ inputPointer [ i ] ] ++;
}
// Compare every bin; any mismatch fails the test case.
int flag = 0 ;
for ( int i = 0 ; i < 256 ;i ++ ) {
printf(" %d GPUHist %d CPUHist\n" , output[ i ] , CPUHist[i]);
if (output[ i ] != CPUHist[i] ) {
flag = 1 ;
}
}
printf("\n\n======================\n\n");
if ( flag ) {
printf("TEST CASE FAIL ");
}
else {
printf("TEST CASE Pass");
}
printf("\n\n======================\n\n");
cudaDeviceReset();
#endif
getch();
return 0;
}
答案 0（得分：1）：
由于您尚未发布两个完整案例进行比较,因此我会根据您的首次发布和更新来推断您的两个案例。
如果你有这样的代码:
int sum = 0 ;
int k = 0 ;
for ( int i = 0 ; i < 256 ; i ++ ) {
k++;
}
partHist[ tx ] = sum;
atomicAdd( &outputPointer[ tx ] , partHist[ tx ] );
(或者即使您的 k 变量被 sum 替换也没关系)编译器可以推断出最终写入 partHist[tx] 的值,而无需实际执行之前的任何代码。因此它可以把之前的代码优化掉(即删除它)而仍然得到相同的结果,并且它确实会这样做。这就是代码执行时间极短、您测得约 9us 的原因。
另一方面,当您的代码发布时:
int sum = 0 ;
for ( int i = 0 ; i < 256 ; i ++ ) {
sum += (localDispersedHistogram [ i ] [ tx/ 8 ] & ( 1 << (tx % 8 ))) >> (tx % 8 ) ;
}
partHist[ tx ] = sum;
atomicAdd( &outputPointer[ tx ] , partHist[ tx ] );
然后这段代码依赖于前面的代码来确定结果,编译器无法对其进行优化。
您可以这样进一步确认这一点:在关闭优化的情况下编译(nvcc -G ...),或者用 cuobjdump -sass mycode 分别转储两种情况下生成的汇编代码——您会发现,由于编译器优化,两个内核在汇编层面存在主要差异。
每当对代码进行相对较小的更改,并且执行时间发生巨大变化时,我们应始终怀疑编译器优化的副作用。