我是 CUDA 新手。我的项目必须使用 cudaLaunchCooperativeKernel(),但调用它时程序报错:"CUDA Error: operation not supported"(不支持该操作)。例如,下面这段使用普通内核启动语法的代码可以正常工作:
// Element-wise in-place sum: a[idx] = a[idx] + b[idx].
// Each thread handles the one element at its flat 1D global index; threads
// whose index is N or greater return without touching memory.
__global__ void add(int *a, int *b) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N) {
        return;
    }
    a[idx] += b[idx];
}
// Prints `status` in the program's "CUDA Error: ..." format when it is not
// cudaSuccess. Returns true on success so calls can be chained with &&.
static bool cudaOk(cudaError_t status)
{
    if (status == cudaSuccess) {
        return true;
    }
    printf("CUDA Error: %s\n",
           cudaGetErrorString(status));
    return false;
}

// Fills two N-element host arrays with pseudo-random values (rand() % 100),
// prints them, computes a[i] += b[i] on the GPU with the `add` kernel, and
// prints the updated `a`.
//
// Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
int main()
{
    int a[N], b[N];
    int *dev_a = 0, *dev_b = 0;
    const size_t bytes = N * sizeof(int);

    const int threadsPerBlock = 256;
    // Ceiling division: just enough blocks to cover all N elements, with no
    // spare block when N is an exact multiple of threadsPerBlock.
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    for (int i = 0; i < N; i++) {
        a[i] = rand() % 100;
        b[i] = rand() % 100;
    }

    for (int i = 0; i < N; i++) {
        cout << " " << setfill('0') << setw(3) << a[i];
    }
    cout << endl;
    for (int i = 0; i < N; i++) {
        cout << " " << setfill('0') << setw(3) << b[i];
    }

    // Every runtime call is checked; the && chain skips the remaining steps
    // as soon as one of them fails.
    bool ok = cudaOk(cudaMalloc((void**)&dev_a, bytes));
    ok = ok && cudaOk(cudaMalloc((void**)&dev_b, bytes));
    ok = ok && cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    ok = ok && cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    if (ok) {
        add<<<blocks, threadsPerBlock>>>(dev_a, dev_b);
        // A kernel launch returns nothing; configuration/launch failures are
        // only visible through cudaGetLastError(), so query it immediately.
        ok = cudaOk(cudaGetLastError());

        // Cooperative alternative. The kernel arguments must be passed as an
        // array holding one pointer per kernel parameter. Writing
        // `(void**)(&dev_a, &dev_b)` is a comma expression that evaluates to
        // `&dev_b` alone, so it does not describe both arguments.
        //     void *args[] = { &dev_a, &dev_b };
        //     cudaLaunchCooperativeKernel((void*)add, dim3(blocks),
        //                                 dim3(threadsPerBlock), args, 0, 0);
        // The device must also report a non-zero cudaDevAttrCooperativeLaunch
        // attribute (query it with cudaDeviceGetAttribute), and all blocks of
        // the grid must be able to be resident on the device at once.
    }

    // Blocking copy: it waits for the kernel and also surfaces any error that
    // occurred while the kernel was executing. `b` is never written by the
    // kernel, so only `a` needs to come back.
    ok = ok && cudaOk(cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost));

    cout << endl;
    if (ok) {
        // Only print results that are known to be valid.
        for (int i = 0; i < N; i++) {
            cout << " " << setfill('0') << setw(3) << a[i];
        }
    }

    // cudaFree accepts a null pointer, so this is safe even if an allocation
    // failed. Free errors are reported only when nothing failed earlier, to
    // avoid echoing the same failure twice.
    const cudaError_t freeA = cudaFree(dev_a);
    const cudaError_t freeB = cudaFree(dev_b);
    if (ok) {
        ok = cudaOk(freeA) && cudaOk(freeB);
    }
    cout << endl;

    return ok ? 0 : 1;
}
但是,如果我把下面这一行内核启动代码改掉,程序就无法正常工作了:
// Ordinary (non-cooperative) launch of `add`: (N / 256) + 1 blocks of 256 threads each.
add << < (N / 256) + 1, 256 >> > (dev_a, dev_b);
替换为下面这一行:
// Cooperative launch of `add` with the same grid and block sizes.
// NOTE(review): `(&dev_a, &dev_b)` is a comma expression, so only `&dev_b` is
// cast to void** and passed as the argument list. The runtime API documents
// `args` as an array with one pointer per kernel parameter, e.g.
// `void *args[] = { &dev_a, &dev_b };` - confirm against the
// cudaLaunchCooperativeKernel reference. The device must also support
// cooperative launch (cudaDevAttrCooperativeLaunch) - TODO verify on the
// target system.
cudaLaunchCooperativeKernel((void*)add, (N / 256) + 1, 256, (void**)(&dev_a, &dev_b));
我使用的是 CUDA 10,显卡是 Pascal 架构的 GTX 1050。