CUDA 中调用 cudaLaunchCooperativeKernel() 时报错“operation not supported”(不支持该操作)

时间:2018-12-31 08:42:54

标签: cuda

我是 CUDA 新手。我必须在项目中使用 cudaLaunchCooperativeKernel(),但程序运行时报出如下错误:“CUDA Error: operation not supported”(不支持该操作)。例如,下面这段使用普通 <<<...>>> 语法启动内核的代码可以正常运行:

// Element-wise in-place addition on the device: every thread whose flat
// global index is below N adds b[idx] into a[idx]. Threads whose index is
// N or larger return without touching memory.
__global__ void add(int *a, int *b) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the tail of the grid: the launch may create more threads than N.
    if (idx >= N) {
        return;
    }

    a[idx] += b[idx];
}

// Prints "CUDA Error: <description>" when a CUDA runtime call has failed.
// Returns true when `status` is cudaSuccess and false otherwise.
static bool cudaCallSucceeded(cudaError_t status)
{
    if (status == cudaSuccess) {
        return true;
    }
    printf("CUDA Error: %s\n", cudaGetErrorString(status));
    return false;
}

// Fills two host arrays of N random values in [0, 99], prints both, adds b
// into a on the device with the `add` kernel, and prints the updated a.
// Returns 1 as soon as any CUDA call or the kernel launch reports an error
// (after releasing the device buffers); otherwise falls off the end of main,
// i.e. exits with status 0.
int main()
{
    int a[N], b[N];
    // Start from null so the cleanup below is safe even if an allocation fails
    // (cudaFree on a null pointer is a no-op).
    int *dev_a = 0, *dev_b = 0;

    bool ok = cudaCallSucceeded(cudaMalloc((void**)&dev_a, N * sizeof(int)))
           && cudaCallSucceeded(cudaMalloc((void**)&dev_b, N * sizeof(int)));

    for (int i = 0; i < N; i++) {
        a[i] = rand() % 100;
        b[i] = rand() % 100;
    }

    for (int i = 0; i < N; i++) {
        cout << " " << setfill('0') << setw(3) << a[i];
    }

    cout << endl;

    for (int i = 0; i < N; i++) {
        cout << " " << setfill('0') << setw(3) << b[i];
    }

    ok = ok
      && cudaCallSucceeded(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice))
      && cudaCallSucceeded(cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice));

    if (ok) {
        add << < (N / 256) + 1, 256 >> > (dev_a, dev_b);
        // Cooperative-launch alternative (not used here). The kernel arguments
        // must be passed as an array holding the address of each argument:
        //     void *args[] = { &dev_a, &dev_b };
        //     cudaLaunchCooperativeKernel((void*)add, dim3((N / 256) + 1), dim3(256), args, 0, 0);
        // Writing (void**)(&dev_a, &dev_b) is a comma expression and passes only &dev_b.
        // NOTE(review): the device must also report cudaDevAttrCooperativeLaunch
        // (query it with cudaDeviceGetAttribute) — confirm on the target machine.

        // A kernel launch returns no status itself; configuration errors are
        // reported through cudaGetLastError().
        ok = cudaCallSucceeded(cudaGetLastError())
          // The blocking copy also surfaces errors raised while the kernel ran.
          // Only `a` is copied back: the kernel never writes to `b`.
          && cudaCallSucceeded(cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost));
    }

    if (ok) {
        cout << endl;
        for (int i = 0; i < N; i++) {
            cout << " " << setfill('0') << setw(3) << a[i];
        }
    }

    // Release the device buffers on every path, including failures.
    cudaFree(dev_a);
    cudaFree(dev_b);

    cout << endl;

    if (!ok) {
        return 1;
    }
}

但是,如果我把下面这行内核启动代码替换掉,程序就无法正常工作:

// Ordinary triple-chevron launch: (N / 256) + 1 blocks of 256 threads, passing both device pointers.
add << < (N / 256) + 1, 256 >> > (dev_a, dev_b);

替换成下面这一行:

// NOTE(review): (&dev_a, &dev_b) is a comma expression, so it evaluates to &dev_b alone and the
// cast hands the runtime a bogus argument array. The arguments should be passed as an array of
// addresses instead, e.g. `void *args[] = { &dev_a, &dev_b };` followed by passing `args`.
// NOTE(review): "operation not supported" may additionally mean the device/driver does not report
// cudaDevAttrCooperativeLaunch — confirm with cudaDeviceGetAttribute before launching.
cudaLaunchCooperativeKernel((void*)add, (N / 256) + 1, 256, (void**)(&dev_a, &dev_b));

我使用的是 CUDA 10,显卡是 Pascal 架构的 GTX 1050。

0 个答案:

没有答案