启动后,CUDA返回错误代码77

时间:2017-11-25 14:50:24

标签: cuda

我有如下结构体:

// A single point. The number of coordinates is not stored in the struct;
// callers pass it separately (NUM_OF_DIMENSIONS in the host wrapper).
typedef struct
{
    float* coordinates;  // pointer to the array of float coordinate values
} Point;

以及如下 CUDA 函数:

// Host wrapper (the failing version shown in the question): tries to copy one
// Point to the GPU, run calculateCenter on it, and copy the result back.
//
// point             - host Point whose coordinates array is to be processed
// NUM_OF_DIMENSIONS - number of floats in point->coordinates
// NUM_OF_POINTS     - divisor forwarded to the kernel
//
// Returns the status of the final cudaDeviceReset() call: every earlier
// failure is only logged to stderr, execution continues, and cudaStatus is
// overwritten by the next call.
//
// NOTE: this version uses point_dev->coordinates on the host even though
// point_dev points to device memory (see the BUG notes below). The question
// reports error code 77 from cudaDeviceSynchronize for this code.
cudaError_t calculateCenterUsingCuda(Point* point, const int NUM_OF_DIMENSIONS, const int NUM_OF_POINTS)
{
    Point* point_dev;

    cudaError_t cudaStatus;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    }

    // Allocate the Point struct itself in device memory; after this call
    // point_dev holds a device address.
    cudaStatus = cudaMalloc((void**)&point_dev, 1 * sizeof(Point));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }

    // BUG: &point_dev->coordinates is an address inside the device allocation
    // made above, but it is passed as the location where cudaMalloc should
    // store the new pointer; that location has to be in host memory.
    cudaStatus = cudaMalloc((void**)&point_dev->coordinates, NUM_OF_DIMENSIONS * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }

    // Copies the host struct verbatim, so the device-side coordinates field
    // receives the host pointer value stored in point->coordinates.
    cudaStatus = cudaMemcpy(point_dev, point, 1 * sizeof(Point), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }

    // BUG: point_dev->coordinates is evaluated on the host, i.e. it reads
    // through a device pointer from host code.
    cudaStatus = cudaMemcpy(point_dev->coordinates, point->coordinates, NUM_OF_DIMENSIONS * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }

    // One block of 52 threads; the thread count is hard-coded rather than
    // taken from NUM_OF_DIMENSIONS.
    calculateCenter<<<1, 52>>>(point_dev, NUM_OF_POINTS);

    // Reports launch-time errors of the kernel call above.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "calculateCenterlaunch failed: %s\n", cudaGetErrorString(cudaStatus));
    }

    // Waits for the kernel to finish; this is the call whose failure message
    // (error code 77) is quoted in the question.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching calculateCenter!\n", cudaStatus);
    }

    // Copy output vector from GPU buffer to host memory.
    // Note: this overwrites the host struct, including point->coordinates,
    // with the contents of the device-side struct.
    cudaStatus = cudaMemcpy(point, point_dev, 1 * sizeof(Point), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }

    // BUG: point_dev->coordinates is again evaluated on the host, and the
    // direction flag is cudaMemcpyHostToDevice although the destination
    // argument is point->coordinates (this section is meant to copy results
    // back to the host, per the comment above).
    cudaStatus = cudaMemcpy(point->coordinates, point_dev->coordinates, NUM_OF_DIMENSIONS * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }

    // No cudaFree calls are made; the function ends by resetting the device.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
    }

    return cudaStatus;
}

calculateCenter是:

// Kernel: each thread divides one coordinate of *point by NUM_OF_POINTS in
// place and then prints its own thread index.
//
// point         - Point whose coordinates pointer is dereferenced in device code
// NUM_OF_POINTS - divisor applied to every coordinate
//
// The coordinate index is threadIdx.x only (blockIdx is not used) and there is
// no bounds check, so the launch must not use more threads per block than the
// coordinates array has elements.
__global__ void calculateCenter(Point* point, const int NUM_OF_POINTS)
{
    const int dim = threadIdx.x;

    // Fetch the array pointer once instead of going through point-> twice.
    float* const coords = point->coordinates;
    coords[dim] = coords[dim] / NUM_OF_POINTS;

    // Debug trace of the thread index.
    printf("%d\n", dim);
}

基本上,我想用 CUDA 计算一个平均点:把该点的每个坐标除以点的数量(默认为 4)。默认维数为 52。

但是当我运行此代码时出现错误:

cudaDeviceSynchronize returned error code 77 after launching calculateCenter!

为什么会发生这种情况?能否帮忙看看?

谢谢!

1 个答案:

答案 0 :(得分:1)

您的代码在指针管理方式上存在一些问题。这条线

cudaStatus = cudaMalloc((void**)&point_dev->coordinates, NUM_OF_DIMENSIONS * sizeof(float));

是错误的。此时 point_dev 是一个指向设备内存的指针,而主机 API 函数 cudaMalloc 要求用来存放新分配的设备指针的目标地址位于主机内存空间中。在您的代码里,这个目标地址(&point_dev->coordinates)本身位于设备内存中,因此是错误的。

您可以借助一个位于主机上的中间 Point 结构体来解决此问题:先把设备指针存入其中,再把该结构体复制到设备:

// Divides every coordinate of *point by NUM_OF_POINTS on the GPU and writes the
// result back into point->coordinates.
//
// The Point struct holds a pointer, so it is deep-copied in two steps:
//   1. the coordinates array gets its own device allocation, whose address is
//      kept in the host-side staging struct point_for_dev;
//   2. point_for_dev (now holding a device pointer) is copied into the device
//      struct point_dev that is handed to the kernel.
// This way cudaMalloc only ever writes its result to host memory.
//
// point             - host Point to process; its coordinates array is updated
//                     in place on success
// NUM_OF_DIMENSIONS - number of floats in point->coordinates; also used as the
//                     number of threads in the single launched block
// NUM_OF_POINTS     - divisor forwarded to the calculateCenter kernel
//
// Returns cudaSuccess on success. On failure returns the first error that
// occurred (cudaErrorInvalidValue for a null point / null coordinates /
// non-positive NUM_OF_DIMENSIONS). The device is reset before returning, as in
// the original version, except when the arguments are rejected up front.
cudaError_t calculateCenterUsingCuda(Point* point, const int NUM_OF_DIMENSIONS, const int NUM_OF_POINTS)
{
    // All locals are declared before the first goto so that no jump crosses
    // an initialization.
    Point* point_dev = NULL;       // device copy of the Point struct
    Point point_for_dev;           // host staging struct holding the device array pointer
    size_t coordinates_bytes = 0;  // size of the coordinates array in bytes
    cudaError_t cudaStatus;
    cudaError_t resetStatus;

    point_for_dev.coordinates = NULL;

    // Reject arguments that would make the copies or the launch meaningless.
    if (point == NULL || point->coordinates == NULL || NUM_OF_DIMENSIONS <= 0) {
        fprintf(stderr, "calculateCenterUsingCuda: invalid arguments!\n");
        return cudaErrorInvalidValue;
    }
    coordinates_bytes = (size_t)NUM_OF_DIMENSIONS * sizeof(float);

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Cleanup;
    }

    // Device storage for the struct itself.
    cudaStatus = cudaMalloc((void**)&point_dev, 1 * sizeof(Point));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Cleanup;
    }

    // Device storage for the coordinates; the resulting device pointer is
    // written into the HOST staging struct.
    cudaStatus = cudaMalloc((void**)&(point_for_dev.coordinates), coordinates_bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Cleanup;
    }

    // Publish the staging struct (containing the device array pointer) to the device.
    cudaStatus = cudaMemcpy(point_dev, &point_for_dev, 1 * sizeof(Point), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Cleanup;
    }

    // Upload the coordinate values.
    cudaStatus = cudaMemcpy(point_for_dev.coordinates, point->coordinates, coordinates_bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Cleanup;
    }

    // One thread per coordinate, all in a single block.
    calculateCenter <<< 1, NUM_OF_DIMENSIONS >>>(point_dev, NUM_OF_POINTS);

    // Launch-configuration errors (e.g. too many threads per block) show up here.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "calculateCenterlaunch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Cleanup;
    }

    // Errors raised while the kernel was running show up here.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching calculateCenter!\n", cudaStatus);
        goto Cleanup;
    }

    // Copy output vector from GPU buffer to host memory.
    // Fixed: this is a device-to-host copy (it was flagged cudaMemcpyHostToDevice).
    cudaStatus = cudaMemcpy(point->coordinates, point_for_dev.coordinates, coordinates_bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Cleanup;
    }

Cleanup:
    // Release whatever was allocated; cudaFree(NULL) is a no-op. The return
    // values are deliberately ignored so they cannot mask the first error.
    cudaFree(point_for_dev.coordinates);
    cudaFree(point_dev);

    resetStatus = cudaDeviceReset();
    if (resetStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
    }

    // Report the first failure; a reset failure only matters if nothing else failed.
    return (cudaStatus != cudaSuccess) ? cudaStatus : resetStatus;
}

这样,主机 API 函数在主机端需要读写的地址始终位于主机内存中。

基于指针的结构的深层复制需要特殊处理。