CUDA __shared__ variable behaving strangely

Posted: 2014-02-06 08:47:55

Tags: cuda

Consider the following code, which runs in about 9 us on a K20:

__global__ void histogram( unsigned char *inputPointer, int *outputPointer)
{

    __shared__ unsigned char localDispersedHistogram[ 256  ] [ 32 ];
    __shared__ unsigned int partHist[ 256 ] ;

    int i ;
    int tx = threadIdx.x;
    int pixelOffset = (blockIdx.x * blockDim.x) +  threadIdx.x;
    uint8_t val = inputPointer[ pixelOffset ];

    uint8_t  data = val/ 8 ;
    uint8_t  position = val % 8 ;

    /**Trying to avoid loops thats why this code */
    localDispersedHistogram [  tx ] [  tx % 32 ] = 0 ;

    __syncthreads();

    TURN_ON( localDispersedHistogram [  tx ] [  data ] , position );

     __syncthreads();

    partHist[ tx ] = 0;

    int k = 0 ;
    for ( int i = 0 ; i < 256 ; i ++ ) {
        k++;
    }

}

Now, the following code takes 72 us when it accesses the shared variable:

__global__ void histogram( unsigned char *inputPointer, int *outputPointer)
{

    __shared__ unsigned char localDispersedHistogram[ 256  ] [ 32 ];
    __shared__ unsigned int partHist[ 256 ] ;

    int i ;
    int tx = threadIdx.x;
    int pixelOffset = (blockIdx.x * blockDim.x) +  threadIdx.x;
    uint8_t val = inputPointer[ pixelOffset ];

    uint8_t  data = val/ 8 ;
    uint8_t  position = val % 8 ;

    /**Trying to avoid loops thats why this code */
    localDispersedHistogram [  tx ] [  tx % 32 ] = 0 ;

    __syncthreads();

    TURN_ON( localDispersedHistogram [  tx ] [  data ] , position );

     __syncthreads();

    partHist[ tx ] = 0;




    for ( int i = 0 ; i < 256 ; i ++ ) {
        partHist[ tx ]++;
    }

}

Why does the shared access make such a huge difference? I understand that shared access is more expensive than register access, but if you look at the code above, the line

TURN_ON( localDispersedHistogram [  tx ] [  data ] , position );

is also using a shared variable, so why does the operation on localDispersedHistogram take less time, while only the partHist access takes an insane amount of time?

Help.

Update: Apologies:

My kernel configuration is <<< 256, 256 >>>

Full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <conio.h>


#define SIZE_OF_OUTPUT_ARRAY 256 * 256 * 256
#define SIZE_OF_INPUT_ARRAY 256 * 256 

#define TURN_ON(DATA,POSITION) (DATA|=(1<<(POSITION)))


__global__ void histogram( unsigned char *inputPointer, int *outputPointer)
{
    #if 1   
    __shared__ unsigned char localDispersedHistogram[ 256  ] [ 32 ];
    __shared__ long long partHist[ 256 ] ;

    int i ;
    int tx = threadIdx.x;
    int pixelOffset = (blockIdx.x * blockDim.x) +  threadIdx.x;
    uint8_t val = inputPointer[ pixelOffset ];

    uint8_t  data = val/ 8 ;
    uint8_t  position = val % 8 ;


    for ( int j =  0 ; j < 32 ; j++) {
        localDispersedHistogram[ tx ] [ j ] = 0;
    }

    __syncthreads();

    TURN_ON( localDispersedHistogram [  tx ] [  data ] , position );

     __syncthreads();

    //partHist[ tx ] = 0;

    int sum = 0 ;


    for ( int i = 0 ; i < 256 ; i ++ ) {
        sum += (localDispersedHistogram [  i ] [  tx/ 8 ] & ( 1 << (tx % 8 ))) >> (tx % 8 ) ;
    }
    partHist[ tx ] = sum;

    atomicAdd( &outputPointer[ tx ] , partHist[ tx ]  );

    #endif

}



int main()
{

    #if 1
    printf(" Code Name, Sad buddy 17 ");

    unsigned char *inputPointer = (unsigned char * ) malloc (SIZE_OF_INPUT_ARRAY);

    for ( int i = 0 ; i < SIZE_OF_INPUT_ARRAY ; i ++ ) {

        int t = rand() % 256 ; 
        //int t = 0; 
        inputPointer [ i ]   = t;

    }

    unsigned char *device_inputPointer;
    int  *device_outputPointer;

    cudaMalloc((void**)&device_inputPointer, SIZE_OF_INPUT_ARRAY);
    cudaMemcpy( device_inputPointer,  inputPointer , SIZE_OF_INPUT_ARRAY, cudaMemcpyHostToDevice );
    cudaMalloc((void**)&device_outputPointer, 256 * sizeof ( int ) );
    cudaMemset(device_outputPointer,0,256 * sizeof ( int ) );

    histogram <<< 256 , 256 >>>  ( device_inputPointer , device_outputPointer  );


    unsigned int *output = ( unsigned int * )malloc ( 256 * sizeof( int ));

    cudaMemcpy( output,  device_outputPointer , 256 * sizeof( int ), cudaMemcpyDeviceToHost );





    unsigned int CPUHist [ 256 ]  ;
    unsigned int GPUHist [ 256 ]  ;

    for ( int i = 0 ; i < 256 ;i ++ ) {
        CPUHist[ i ] = 0;
        GPUHist [ i ] = 0;

        //printf( " %d " ,  inputPointer[ i ]);
    }


    for ( int i = 0 ; i < SIZE_OF_INPUT_ARRAY ; i++ ) {
        CPUHist[ inputPointer [ i ] ] ++;
    }





    int flag = 0 ;
    for ( int i = 0 ; i < 256 ;i ++ ) {
        printf(" %d  GPUHist  %d CPUHist\n" , output[ i ] , CPUHist[i]);

        if (output[ i ] != CPUHist[i]  ) {
            flag = 1 ;
        }
    }

    printf("\n\n======================\n\n");

    if ( flag ) {
        printf("TEST CASE FAIL ");
    }
    else {
        printf("TEST CASE Pass");
    }

    printf("\n\n======================\n\n");


    cudaDeviceReset();

    #endif
    getch();
    return 0;
}

1 Answer:

Answer 0 (score: 1):

Since you haven't posted two complete cases to compare, I am inferring your two cases from your original post and your update.

If you have code like this:

int sum = 0 ;



int k = 0 ;
for ( int i = 0 ; i < 256 ; i ++ ) {
    k++;
}

partHist[ tx ] = sum;

atomicAdd( &outputPointer[ tx ] , partHist[ tx ]  );

(or even if your k variable were replaced by sum, it doesn't matter), the compiler can figure out what will always end up in partHist[tx] without actually running any of the preceding code. It can therefore optimize the preceding code away (i.e., delete it) and still get the same result, and it will do so. As a result, the code executes very quickly and you get the ~9 us timing result.
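As a rough illustration of this dead-code elimination (a hypothetical, minimal kernel pair, not your actual code), the first kernel below gives the optimizer nothing observable to preserve, while the second forces the loop to be kept because its result reaches global memory:

#include <cstdio>

__global__ void dead_loop( int *out )
{
    // out is intentionally unused here
    int k = 0;
    for ( int i = 0 ; i < 256 ; i++ ) {
        k++;                          // k is never stored or read afterwards
    }
    // nothing observable depends on k, so the compiler may delete the loop
}

__global__ void live_loop( int *out )
{
    int k = 0;
    for ( int i = 0 ; i < 256 ; i++ ) {
        k++;
    }
    out[ threadIdx.x ] = k;           // k reaches global memory, so the loop must run
}

int main()
{
    int *d_out;
    cudaMalloc( (void**)&d_out, 256 * sizeof( int ) );
    dead_loop <<< 1, 256 >>> ( d_out );
    live_loop <<< 1, 256 >>> ( d_out );
    cudaDeviceSynchronize();
    cudaFree( d_out );
    printf("done\n");
    return 0;
}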

On the other hand, with the code as you have now posted it:

int sum = 0 ;


for ( int i = 0 ; i < 256 ; i ++ ) {
    sum += (localDispersedHistogram [  i ] [  tx/ 8 ] & ( 1 << (tx % 8 ))) >> (tx % 8 ) ;
}
partHist[ tx ] = sum;

atomicAdd( &outputPointer[ tx ] , partHist[ tx ]  );

this code depends on the preceding code to determine its result, and the compiler cannot optimize it away.

You can get additional confirmation of this by compiling without optimization (nvcc -G ...), or by using cuobjdump -sass mycode to dump the generated assembly code in each case; you will find a major difference in the kernel code at the assembly level due to compiler optimization.
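For example, assuming the source file is called mycode.cu (the file and output names here are arbitrary), the two SASS dumps could be produced like this:

nvcc -o mycode mycode.cu
cuobjdump -sass mycode > optimized.sass

nvcc -G -o mycode_debug mycode.cu
cuobjdump -sass mycode_debug > debug.sass

Comparing optimized.sass and debug.sass should show the loop disappearing entirely from the optimized build of the first kernel.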

Whenever a relatively small change to the code produces a huge change in execution time, we should always suspect side effects of compiler optimization.