静态与动态CUDA共享内存分配的性能

时间:2011-07-07 18:45:11

标签: cuda

我有 2 个内核做同样的事情。其中一个静态分配共享内存,而另一个在运行时动态分配内存。我使用共享内存作为 2D 数组,所以对于动态分配,我有一个计算内存位置的宏。现在,这 2 个内核生成的结果完全相同。但是,我从两个内核得到的计时结果相差 3 倍!静态内存分配要快得多。很抱歉,我无法发布任何代码。有人可以解释一下原因吗?

1 个答案:

答案 0 :(得分:2)

我没有证据表明静态共享内存分配比动态共享内存分配更快。正如上面的评论所指出的那样,没有可复现的示例就无法回答你的问题。至少在下面的代码中,同一内核无论使用静态还是动态共享内存分配来运行,计时结果都完全相同:

#include <cuda.h>
#include <stdio.h>

#define BLOCK_SIZE 512

/********************/
/* CUDA ERROR CHECK */
/********************/
// Check the result of a CUDA runtime call; the macro captures call site info.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Print a diagnostic (error string, file, line) when `code` is not cudaSuccess.
// `file` must be const char*: __FILE__ is a string literal, and binding a
// string literal to a non-const char* is ill-formed in modern C++.
// If `abort` is true (default), terminate the process with the error code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/***********************************/
/* SHARED MEMORY STATIC ALLOCATION */
/***********************************/
// Square each element of d_inout in place, staging through statically
// allocated shared memory. Launch with blockDim.x == BLOCK_SIZE; the tail
// block may be partial, hence the per-access bounds guards.
__global__ void kernel_static_memory_allocation(int *d_inout, int N)
{
    __shared__ int s[BLOCK_SIZE];

    const int tid   = threadIdx.x;
    const int i     = blockIdx.x * blockDim.x + threadIdx.x;

    // __syncthreads() must be reached by ALL threads of the block. The
    // original code placed the barriers inside `if (i < N)`; in the partial
    // tail block (N is not a multiple of BLOCK_SIZE) some threads skip the
    // branch, which is undefined behavior. Guard each access individually
    // and keep the barriers at block scope instead.
    if (i < N) s[tid] = d_inout[i];
    __syncthreads();

    if (i < N) s[tid] = s[tid] * s[tid];
    __syncthreads();

    if (i < N) d_inout[i] = s[tid];
}

/************************************/
/* SHARED MEMORY DYNAMIC ALLOCATION */
/************************************/
// Square each element of d_inout in place, staging through dynamically
// allocated shared memory. The buffer size comes from the third launch
// parameter and must be at least blockDim.x * sizeof(int).
__global__ void kernel_dynamic_memory_allocation(int *d_inout, int N)
{
    extern __shared__ int s[];

    const int tid   = threadIdx.x;
    const int i     = blockIdx.x * blockDim.x + threadIdx.x;

    // Same fix as the static variant: __syncthreads() must be executed by
    // every thread of the block, so the barriers stay outside the `i < N`
    // guard (the tail block is partial when N % blockDim.x != 0).
    if (i < N) s[tid] = d_inout[i];
    __syncthreads();

    if (i < N) s[tid] = s[tid] * s[tid];
    __syncthreads();

    if (i < N) d_inout[i] = s[tid];
}

/********/
/* MAIN */
/********/
// Benchmark: time the static-shared-memory kernel and the
// dynamic-shared-memory kernel on the same N-element buffer and print both
// elapsed times. Returns 0 on success (gpuErrchk aborts on CUDA errors).
int main(void)
{
    int N = 1000000;

    // Host input: a[i] = i.
    int* a = (int*)malloc(N*sizeof(int));
    if (a == NULL) { fprintf(stderr, "host malloc failed\n"); return 1; }

    for (int i = 0; i < N; i++) { a[i] = i; }

    int *d_inout; gpuErrchk(cudaMalloc(&d_inout, N * sizeof(int))); 

    // Ceiling division so the partial tail block is still launched.
    int n_blocks = N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1);

    gpuErrchk(cudaMemcpy(d_inout, a, N*sizeof(int), cudaMemcpyHostToDevice));

    float time;
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    // --- Static shared-memory version ---
    // Standard event-timing pattern: record stop right after the launch and
    // then synchronize on the stop event. The original code called
    // cudaDeviceSynchronize() before recording stop, which folds host-side
    // synchronization latency into the measured interval.
    gpuErrchk(cudaEventRecord(start, 0));
    kernel_static_memory_allocation<<<n_blocks,BLOCK_SIZE>>>(d_inout, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));
    printf("Static allocation - elapsed time:  %3.3f ms \n", time);

    // --- Dynamic shared-memory version ---
    // Third launch parameter supplies the shared-memory byte count.
    gpuErrchk(cudaEventRecord(start, 0));
    kernel_dynamic_memory_allocation<<<n_blocks,BLOCK_SIZE,BLOCK_SIZE*sizeof(int)>>>(d_inout, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time, start, stop));
    printf("Dynamic allocation - elapsed time:  %3.3f ms \n", time);

    // Release resources the original version leaked.
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(d_inout));
    free(a);

    return 0;
}

可能的原因是两个内核的反汇编代码完全相同,即使将 int N = 1000000; 替换为 int N = rand(); 也不会改变。