cudaMemcpyDeviceToHost()失败

时间:2015-04-30 21:51:35

标签: c++ c cuda gpu

我有以下代码(假设所有内容都已正确定义):

#include "OurIncludes.h"
#include <ctime>

// Pairwise intersection test between two arrays of circles.
//
// Launch layout: a 2D grid/block arrangement whose threads cover at least the
// 10 x 10 index range; the x index selects an element of part1 and the y index
// selects an element of part2. Threads that fall outside that range do nothing.
//
// d_intersects : device buffer with room for at least 100 ints; the result for
//                the pair (part1[x], part2[y]) is stored at offset x + y * 10.
// part1, part2 : device arrays with at least 10 Circle elements each.
//
// Circle::intersect must be callable from device code; whatever it returns is
// stored as an int.
__global__ void kernel_testing(int *d_intersects, Circle *part1, Circle *part2)
{
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Guard clause: skip threads outside the 10 x 10 problem domain.
    if (col >= 10 || row >= 10) {
        return;
    }

    d_intersects[row * 10 + col] = part1[col].intersect(part2[row]);
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess and false otherwise, so calls can
// be chained with && and every later step is skipped after the first failure.
static bool cudaStepOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Builds two host arrays of random circles, copies them to the device, runs
// kernel_testing on every pair and copies the 10 x 10 result table back.
//
// Returns 0 when every CUDA step succeeded and 1 otherwise. The first failing
// step is reported on stderr and all remaining steps are skipped; host and
// device memory is released on every path.
int main(void)
{
    // Must match the bound hard-coded in kernel_testing (i < 10 && j < 10).
    const int kCircles = 10;
    const int kPairs = kCircles * kCircles;
    const size_t circleBytes = sizeof(Circle) * kCircles;
    const size_t resultBytes = sizeof(int) * kPairs;

    dim3 GRID(1, 1);
    dim3 BLOCK(kCircles, kCircles);

    RandObj randGenerator;
    Circle *obj = new Circle[kCircles];
    Circle *obj2 = new Circle[kCircles];
    // Device pointers start out null so the cudaFree calls below are harmless
    // even when an allocation never happened.
    Circle *d_obj = NULL;
    Circle *d_obj2 = NULL;
    int *d_intersects = NULL;
    int intersects[kPairs];

    bool ok = cudaStepOk(cudaMalloc((void **)&d_obj, circleBytes),
                         "Failed to allocate memory for d_obj");
    ok = ok && cudaStepOk(cudaMalloc((void **)&d_obj2, circleBytes),
                          "Failed to allocate memory for d_obj2");
    ok = ok && cudaStepOk(cudaMalloc((void **)&d_intersects, resultBytes),
                          "Failed to allocate memory for d_intersects");

    if (ok) {
        // NOTE(review): makeRandomCircle() returns a pointer that is only
        // dereferenced and copied here. If it allocates with new, every call
        // leaks one Circle -- confirm who owns the returned object.
        for (int i = 0; i < kCircles; i++) {
            obj[i] = (*randGenerator.makeRandomCircle());
        }
        for (int i = 0; i < kCircles; i++) {
            obj2[i] = (*randGenerator.makeRandomCircle());
        }
    }

    // NOTE(review): these are raw byte copies of Circle. If Circle has virtual
    // functions or pointer members, the device copies will contain host
    // addresses and the kernel will fault with an illegal memory access --
    // confirm against the definition of Circle.
    ok = ok && cudaStepOk(cudaMemcpy(d_obj, obj, circleBytes, cudaMemcpyHostToDevice),
                          "Failed to copy data to d_obj");
    ok = ok && cudaStepOk(cudaMemcpy(d_obj2, obj2, circleBytes, cudaMemcpyHostToDevice),
                          "Failed to copy data to d_obj2");

    if (ok) {
        kernel_testing<<<GRID, BLOCK>>>(d_intersects, d_obj, d_obj2);
        // A launch returns no status of its own: configuration errors show up
        // in cudaGetLastError(), faults raised while the kernel runs show up at
        // the next synchronizing call. Checking both here attributes a kernel
        // failure to the kernel instead of to the cudaMemcpy that follows.
        ok = cudaStepOk(cudaGetLastError(), "Failed to launch kernel_testing") &&
             cudaStepOk(cudaDeviceSynchronize(), "kernel_testing failed while executing");
    }

    ok = ok && cudaStepOk(cudaMemcpy(intersects, d_intersects, resultBytes,
                                     cudaMemcpyDeviceToHost),
                          "Failed to copy d_intersects back to the host");

    // Best-effort cleanup: the return values are deliberately ignored because
    // after a kernel fault every later runtime call reports the same error.
    cudaFree(d_intersects);
    cudaFree(d_obj);
    cudaFree(d_obj2);
    delete[] obj;
    delete[] obj2;

    return ok ? 0 : 1;
}

出于某种原因,代码始终在cudaMemcpyDeviceToHost处失败,我无法找到原因。我尝试使用不同的对象(三角形,球体等)进行启动,但是当我需要将数据从设备复制回主机时,它总是会失败。感谢任何帮助和/或建议,我对使用CUDA进行编程非常陌生。感谢。

编辑:错误代码表示遇到了非法内存访问,但我不知道为什么会发生这种情况。

编辑2:我已经删除了所有的双重指针,并将我的数组"扁平化"了,但我仍然遇到同样的问题。我现在完全没有头绪了。

1 个答案:

答案 0 :(得分:-2)

我想我曾经遇到过这样的问题,我的解决方案是:

cudaError_t status = cudaMemcpy(devPtr, srcPtr, size * sizeof(int), cudaMemcpyHostToDevice);
if (status == cudaSuccess) { ... }

尽量不要将函数调用的返回值直接与cudaSuccess比较,而是先把返回值保存到一个变量中,再用该变量进行比较。