未指定的启动失败 - CUDA中的并行扫描

时间:2014-06-06 17:17:16

标签: c cuda

我正在使用GeForce GT 520(计算能力v2.1)来运行一个程序,该程序对int个元素的数组执行扫描操作。这是代码:

This is an implementation of the parallel scan algorithm.
Only a single block of threads is used. Maximum array size = 2048

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define errorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
if (code != cudaSuccess) 
    fprintf(stderr,"GPUassert: %s, file: %s line: %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);

__global__ void blelloch_scan(int* d_in, int* d_out, int n) 
    extern __shared__ int temp[];// allocated on invocation

    int thid = threadIdx.x;
    int offset = 1;

    temp[2*thid] = d_in[2*thid]; // load input into shared memory
    temp[2*thid+1] = d_in[2*thid+1];

    // build sum in place up the tree
    for (int d = n>>1; d > 0; d >>= 1)
        if (thid < d)
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            temp[bi] += temp[ai];
        offset *= 2;

    // clear the last element
    if (thid == 0)
    temp[n - 1] = 0; 

    // traverse down tree & build scan
    for (int d = 1; d < n; d *= 2)
        offset >>= 1;
        if (thid < d)
            int ai = offset*(2*thid+1)-1;
            int bi = offset*(2*thid+2)-1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;

    d_out[2*thid] = temp[2*thid]; // write results to device memory
    d_out[2*thid+1] = temp[2*thid+1];

int main(int argc, char **argv)
if(argc != 2)
    printf("Input Syntax: ./a.out <number-of-elements>\nProgram terminated.\n");
    exit (1);        
ARRAY_SIZE = (int) atoi(*(argv+1));

int *h_in, *h_out, *d_in, *d_out, i;
h_in = (int *) malloc(sizeof(int) * ARRAY_SIZE);
h_out = (int *) malloc(sizeof(int) * ARRAY_SIZE);

cudaDeviceProp devProps;
if (cudaGetDeviceProperties(&devProps, 0) == 0)
    printf("Using device %d:\n", 0);
    printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
           devProps.name, (int)devProps.totalGlobalMem, 
           (int)devProps.major, (int)devProps.minor, 

for(i = 0; i < ARRAY_SIZE; i++)
    h_in[i] = i;    

errorCheck(cudaMalloc((void **) &d_in, sizeof(int) * ARRAY_SIZE));
errorCheck(cudaMalloc((void **) &d_out, sizeof(int) * ARRAY_SIZE));    
errorCheck(cudaMemcpy(d_in, h_in, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice));

blelloch_scan <<<1, ARRAY_SIZE / 2, sizeof(int) * ARRAY_SIZE>>> (d_in, d_out, ARRAY_SIZE);

errorCheck(cudaMemcpy(h_out, d_out, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost));

for(i = 0; i < ARRAY_SIZE; i++)
    printf("h_in[%d] = %d, h_out[%d] = %d\n", i, h_in[i], i, h_out[i]);    
return 0;

在使用nvcc -arch=sm_21 parallel-scan.cu -o parallel-scan进行编译时,出现错误: GPUassert: unspecified launch failure, file: parallel-scan-single-block.cu line: 106



  • 从内核中可以看出,如果一个块有1000个线程,它可以在2000个元素上运行。因此,blockSize = ARRAY_SIZE / 2。
  • 并且,共享内存= sizeof(int)* ARRAY_SIZE
  • 所有内容都加载到共享内存中。然后,完成向上扫描,最后一个元素设置为0.最后,完成向下扫描以对元素进行独占扫描。

我使用this file作为编写此代码的参考。我不明白我的代码中的错误是什么。任何帮助将不胜感激。

blelloch_scan <<<1, ARRAY_SIZE / 2, sizeof(int) * ARRAY_SIZE>>>

意味着然后是内核0 < thid < int(ARRAY_SIZE/2)

但是,您的内核需要至少(2 * int(ARRAY_SIZE/2)) + 1个可用共享内存字才能正常工作,否则:

temp[2*thid+1] = d_in[2*thid+1];


如果我的整数数学技能不太生疏,这应该意味着如果ARRAY_SIZE是奇数,代码将是安全的,因为ARRAY_SIZE == (2 * int(ARRAY_SIZE/2)) + 1代表任何奇数。但是,如果ARRAY_SIZE是偶数,那么ARRAY_SIZE < (2 * int(ARRAY_SIZE/2)) + 1就会出现问题。


我无法评论内核的其余部分是否正确,但使用共享内存大小sizeof(int) * size_t(1 + ARRAY_SIZE)会使这个特定问题消失。