Here is the GPU kernel snippet:
__global__ void POCKernel(int *a)
{
    int i = threadIdx.x;
    a[i] = a[i] + 1;
    if (i < 1024 * 1024)
    {
        double dblNewMemoryVarA[15];
        double dblNewMemoryVarB[15];
        double dblNewMemoryVarC[15];
        //double* dblNewMemoryVarA = (double*)malloc(15 * sizeof(double));
        ////memset(dblNewMemoryVarA, 0, 15 * sizeof(double));
        //double* dblNewMemoryVarB = (double*)malloc(15 * sizeof(double));
        ////memset(dblNewMemoryVarB, 0, 15 * sizeof(double));
        //double* dblNewMemoryVarC = (double*)malloc(15 * sizeof(double));
        ////memset(dblNewMemoryVarC, 0, 15 * sizeof(double));
        for (int j = 0; j < 15; j++)
        {
            dblNewMemoryVarA[j] = 0;
            dblNewMemoryVarB[j] = 0;
            dblNewMemoryVarC[j] = 0;
        }
        dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
        dblNewMemoryVarC[i] = dblNewMemoryVarA[i] * dblNewMemoryVarB[i];
        dblNewMemoryVarC[i] = dblNewMemoryVarA[i] - dblNewMemoryVarB[i];
        /*free(dblNewMemoryVarA);
        free(dblNewMemoryVarB);
        free(dblNewMemoryVarC);*/
    }
}
The calling function for this kernel is:
int main()
{
    const int arraySize = 1024 * 1024;
    int* a = new int[arraySize];
    int *dev_a = 0;
    for (int i = 0; i < arraySize; i++)
    {
        a[i] = 5;
    }
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA failed!");
        return 1;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_a, arraySize * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    POCKernel << <4096, 256 >> >(dev_a);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

Error:
    cudaFree(dev_a);
    return 0;
}
At cudaDeviceSynchronize, the error code is 4 - unspecified launch failure. Can someone tell me why I am facing this problem?
Answer 0 (score: 2)
This code is odd in a number of ways, but to be clear, there is one definite problem in these lines of kernel code:
int i = threadIdx.x;
...
if (i < 1024 * 1024)
{
    double dblNewMemoryVarA[15];
    double dblNewMemoryVarB[15];
    double dblNewMemoryVarC[15];
    ...
    dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
You are launching threadblocks of 256 threads each:
POCKernel << <4096, 256 >> >(dev_a);
                    ^^^
This means your threadIdx.x variable will range from 0 to 255 across all the threads in a block:
int i = threadIdx.x;
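(As an aside, and not something the posted kernel does: the conventional pattern for a unique per-thread index across the whole grid, which for this launch would range from 0 to 4096*256 - 1, would look like this:)

int i = blockIdx.x * blockDim.x + threadIdx.x;  // unique global thread index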
In your local variables, you have allocated space for 15 quantities:
double dblNewMemoryVarA[15];
But you are attempting to index into these arrays using i, which as already mentioned can range up to 255:
dblNewMemoryVarC[i] = dblNewMemoryVarA[i] + dblNewMemoryVarB[i];
Therefore this generates out-of-bounds indexing, which could very well lead to the kernel launch failure.
We can't say for certain, because you haven't provided a complete code, nor indicated how you compiled it or what environment you are running in. But the above is certainly illegal from a code-correctness standpoint.
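For reference, here is a minimal sketch of one way the kernel could be written so that all indexing stays in bounds. It is only an illustration of the bounds issue, assuming the intent was one thread per element of a and purely local work on the 15-element arrays; it is not necessarily what the original code was meant to compute, and the extra n parameter is an assumption:

__global__ void POCKernel(int *a, int n)
{
    // Unique index per thread across the whole grid.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)   // n is passed from the host, e.g. 1024 * 1024
    {
        a[i] = a[i] + 1;
        double dblNewMemoryVarA[15];
        double dblNewMemoryVarB[15];
        double dblNewMemoryVarC[15];
        for (int j = 0; j < 15; j++)
        {
            dblNewMemoryVarA[j] = 0;
            dblNewMemoryVarB[j] = 0;
            // Index the local arrays only with j (0..14), never with the thread index.
            dblNewMemoryVarC[j] = dblNewMemoryVarA[j] + dblNewMemoryVarB[j];
        }
        // Fold the local result back into global memory so the compiler
        // cannot simply remove the local computation as dead code.
        a[i] += (int)dblNewMemoryVarC[14];
    }
}

// Launched from the host as, for example:
// POCKernel<<<4096, 256>>>(dev_a, arraySize);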
My guess is that you are compiling in debug mode (-G). If not, I would expect the compiler to optimize out everything after the if-test, since none of that code affects any global state.
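(For what it's worth, device-side debug code generation is selected with nvcc's -G flag, which also disables most device code optimization; a plain build without it lets the compiler optimize the device code. The file names below are placeholders:)

nvcc -G -o poc poc.cu    # device debug build, device optimizations disabled
nvcc -o poc poc.cu       # default build, device code is optimized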
As pointed out in the comments, if you are running this on Windows, it may simply be that you are hitting a Windows WDDM timeout.