Question

我的 CUDA 代码在调用 cudaMemcpy 时报错 "unspecified launch failure"（未指定的启动失败）。我在自己的电脑上运行这段代码，系统是 Win10，CUDA 版本为 8.0。我已经多次检查代码，但没有发现问题。我认为不存在内存越界的问题，也检查过核函数，没有发现错误。你能帮我看看我的代码吗？这是我的代码：

/*
 * Pairwise squared Euclidean distance between rows of `a`.
 *
 * Launch layout: blockIdx.x selects row i and threadIdx.x selects row j, so the
 * kernel is meant to be launched as Add<<<1024, 1024>>>(a, dist). Smaller
 * launches are valid and simply compute a subset of the (i, j) pairs.
 *
 * a    - device pointer, row-major, 10240 floats per row; rows i and j are read.
 * dist - device pointer, 1024 floats per row; element [i * 1024 + j] receives
 *        sum_k (a[i][k] - a[j][k])^2.
 */
__global__ void Add(float* a,float* dist)
{
    const int kRows = 1024;   // number of rows in `a`, and side length of `dist`
    const int kCols = 10240;  // number of floats per row of `a`

    const int i = blockIdx.x;
    const int j = threadIdx.x;

    // Guard against a launch configuration larger than the data.
    if (i >= kRows || j >= kRows)
    {
        return;
    }

    // Row i is the same for every thread of the block. Row j differs per
    // thread, so adjacent threads read addresses kCols floats apart.
    const float* rowI = a + i * kCols;
    const float* rowJ = a + j * kCols;

    float sum = 0.0f;
    for (int k = 0; k < kCols; k++)
    {
        const float diff = rowI[k] - rowJ[k];
        sum += diff * diff;
    }

    // Single store of the final value; no need to zero the slot first.
    dist[i * kRows + j] = sum;
}

/*
 * Host driver: fills a 1024 x 10240 matrix, runs the Add kernel to compute the
 * 1024 x 1024 matrix of pairwise squared row distances, copies the result back
 * and prints the elapsed clock() ticks for kernel + copy-back.
 *
 * Returns 0 on success and 1 if any allocation or CUDA call fails.
 */
int main()
{
    const int numRows = 1024;
    const int numCols = 10240;
    const size_t aBytes = sizeof(float) * numRows * numCols;
    const size_t distBytes = sizeof(float) * numRows * numRows;

    float* a = (float*)malloc(aBytes);
    float* distance = (float*)malloc(distBytes);
    float* dev_a = NULL;
    float* dev_distance = NULL;
    cudaError_t cudaStatus;
    int exitCode = 1;

    // Single-pass block so that every failure path falls through to the
    // cleanup code below instead of leaking host/device memory.
    do
    {
        if (a == NULL || distance == NULL)
        {
            // Stop here: continuing would dereference a NULL pointer.
            printf("error\n");
            break;
        }

        for (int i = 0; i < numRows; i++)
        {
            for (int j = 0; j < numCols; j++)
            {
                a[i * numCols + j] = i + 1.0 / 100 * j;
            }
        }
        for (int i = 0; i < numRows; i++)
        {
            for (int j = 0; j < numRows; j++)
            {
                distance[i * numRows + j] = 0.0;
            }
        }

        cudaStatus = cudaMalloc((void**)&dev_a, aBytes);
        if (cudaStatus != cudaSuccess) {
            printf("cudaMalloc dev_a: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }
        cudaStatus = cudaMalloc((void**)&dev_distance, distBytes);
        if (cudaStatus != cudaSuccess) {
            printf("cudaMalloc dev_distance: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }
        cudaStatus = cudaMemcpy(dev_a, a, aBytes, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            printf("cudaMemcpy host to device: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }

        unsigned int start = clock();
        Add <<<numRows, numRows >>>(dev_a, dev_distance);

        // Reports invalid launch configurations only; the kernel itself runs
        // asynchronously and has usually not finished at this point.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            printf( "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }

        // Wait for the kernel so that a fault during its execution is reported
        // here rather than by the following cudaMemcpy.
        // NOTE(review): on Windows WDDM devices a kernel that runs longer than
        // the display driver timeout (TDR) is killed and typically shows up as
        // "unspecified launch failure" - a likely cause for this long-running
        // kernel; confirm with a smaller problem size or a raised TDR delay.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            printf("addKernel execution failed: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }

        cudaStatus = cudaMemcpy(distance, dev_distance, distBytes, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            printf("cudaMemcpy: %s\n", cudaGetErrorString(cudaStatus));
            break;
        }
        unsigned int last = clock() - start;
        printf("%u", last);

        exitCode = 0;
    } while (0);

    // cudaFree(NULL) and free(NULL) are no-ops, so this is safe on every path.
    cudaFree(dev_a);
    cudaFree(dev_distance);
    free(a);
    free(distance);
    return exitCode;
}

CUDA：调用 cudaMemcpy 时出现 "unspecified launch failure"（未指定的启动失败）

0 个答案: