我的 CUDA 代码在调用 cudaMemcpy 时报错“unspecified launch failure”(未指定的启动失败)。我在自己的电脑上运行这段代码,系统是 Win10,CUDA 版本是 8.0。我已经检查了很多遍代码,没有发现问题。我认为不存在内存越界的问题,也检查过内核函数,看起来没有问题。你能帮我看看这段代码吗?这是我的代码:
// Pairwise squared Euclidean distance between the rows of `a`.
//
// For block index i and thread index j this writes
//     dist[i * 1024 + j] = sum over k in [0, 10240) of (a[i*10240 + k] - a[j*10240 + k])^2
//
// Parameters:
//   a    - input, indexed as 1024 rows x 10240 columns, row-major.
//   dist - output, indexed as 1024 x 1024, row-major.
//
// Launch layout: one block per row i (blockIdx.x) and one thread per row j
// (threadIdx.x), i.e. Add<<<1024, 1024>>>(a, dist). Blocks or threads beyond
// 1024 are ignored by the guard below. No shared memory is used.
//
// NOTE(review): neighbouring threads read a[j*10240 + k] 10240 floats apart,
// and a full 1024x1024 launch runs 1024*1024*10240 loop iterations in total,
// so this kernel may run for a long time. On a GPU that also drives a Windows
// display, the driver watchdog (TDR) can abort such a kernel, which shows up
// on the host as "unspecified launch failure" -- confirm against the target
// machine's TDR settings.
__global__ void Add(float* a,float* dist)
{
    const int kRows = 1024;   // rows of `a`; also the side length of `dist`
    const int kCols = 10240;  // columns per row of `a`

    const int i = blockIdx.x;
    const int j = threadIdx.x;

    // Guard so that a launch with more blocks/threads than rows cannot index
    // outside `a` or `dist`.
    if (i >= kRows || j >= kRows) return;

    const float* rowI = a + i * kCols;
    const float* rowJ = a + j * kCols;

    // Accumulate in a register and store once; k is walked in ascending order.
    float sum = 0.0f;
    for (int k = 0; k < kCols; k++)
    {
        const float diff = rowI[k] - rowJ[k];
        sum += diff * diff;
    }
    dist[i * kRows + j] = sum;
}
// Prints "<what>: <CUDA error string>" and returns true when `status` is a failure.
static bool cudaFailed(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) return false;
    printf("%s: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Builds a 1024 x 10240 input matrix on the host, runs the Add kernel to get
// the 1024 x 1024 matrix of pairwise squared row distances, copies the result
// back, and prints the elapsed clock() ticks for launch + copy-back.
//
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
// Every buffer is released on all paths.
int main()
{
    const int kRows = 1024;
    const int kCols = 10240;
    const size_t inputBytes = sizeof(float) * kRows * kCols;
    const size_t outputBytes = sizeof(float) * kRows * kRows;

    float* a = (float*)malloc(inputBytes);
    float* distance = (float*)malloc(outputBytes);
    float* dev_a = NULL;
    float* dev_distance = NULL;
    int exitCode = 1;

    // Single-pass block: `break` jumps to the shared cleanup below.
    do
    {
        if (a == NULL || distance == NULL)
        {
            printf("error\n");
            break;  // never touch a buffer that failed to allocate
        }

        for (int i = 0; i < kRows; i++)
        {
            for (int j = 0; j < kCols; j++)
            {
                // Evaluated in double, then narrowed to float on assignment.
                a[i * kCols + j] = i + 1.0 / 100 * j;
            }
        }
        for (int idx = 0; idx < kRows * kRows; idx++)
        {
            distance[idx] = 0.0f;
        }

        if (cudaFailed(cudaMalloc((void**)&dev_a, inputBytes), "cudaMalloc")) break;
        if (cudaFailed(cudaMalloc((void**)&dev_distance, outputBytes), "cudaMalloc")) break;
        if (cudaFailed(cudaMemcpy(dev_a, a, inputBytes, cudaMemcpyHostToDevice), "cudaMemcpy")) break;

        unsigned int start = clock();

        // One block per row i, one thread per row j.
        Add<<<kRows, kRows>>>(dev_a, dev_distance);

        // Launch-configuration errors are reported here...
        if (cudaFailed(cudaGetLastError(), "addKernel launch failed")) break;

        // ...while faults during execution only surface at the next
        // synchronizing call. Synchronize explicitly so such a failure is
        // attributed to the kernel rather than to the copy that follows.
        // NOTE(review): an "unspecified launch failure" here, on a GPU that
        // also drives a Windows display, is commonly the driver watchdog (TDR)
        // aborting a long-running kernel -- confirm on the target machine.
        if (cudaFailed(cudaDeviceSynchronize(), "addKernel execution failed")) break;

        if (cudaFailed(cudaMemcpy(distance, dev_distance, outputBytes, cudaMemcpyDeviceToHost), "cudaMemcpy")) break;

        unsigned int last = clock() - start;
        printf("%u", last);
        exitCode = 0;
    } while (0);

    // cudaFree and free both accept null pointers, so this is safe on every path.
    cudaFree(dev_a);
    cudaFree(dev_distance);
    free(a);
    free(distance);
    return exitCode;
}