Question

我目前正在为深度学习（铅笔和纸技术）做一些Cuda C ++的东西，但我仍然坚持Cuda的一些奇怪的行为。

这是我的班级：

class Matrix
{
public:
    float* data;
    int width;
    int height;

    Matrix();
    Matrix(const Matrix&);
    ~Matrix();
    void reset();
    friend std::ostream& operator<<(std::ostream&, const Matrix*);
};

其定义：

Matrix::Matrix()
{
}

Matrix::Matrix(const Matrix& copy) : width(copy.width), height(copy.height)
{
    data = new float[width * height];
    std::copy(copy.data, copy.data + width * height, data);
}

Matrix::~Matrix()
{
    delete data;
}

void Matrix::reset()
{
    memset(data, 0, width * height * sizeof(float));
}

std::ostream& operator<<(std::ostream& out, const Matrix* matrix)
{   
    for (int i = 1; i <= matrix->height * matrix->width; ++i)
        out << matrix->data[i - 1] << (i % matrix->width == 0 ? "\n" : "\t");
    return out;
}

这是一个最小，完整和可验证的例子：

__global__ void add_and_display(Matrix* dev_weights)
{
    dev_weights->data[blockIdx.x * dev_weights->width + threadIdx.x] += 1.f;
}

int main()
{
    Matrix* weights = new Matrix(), *dev_weights;
    float* weights_elements;

    //For the purpose of testing, creating a checked pattern Matrix
    weights->width = 9;
    weights->height = 9;
    weights->data = new float[weights->width * weights->height];
    for (int i = 0; i < weights->width * weights->height; ++i)
    {
        if (i % 2 == 0) 
            weights->data[i] = 0;
        else 
            weights->data[i] = 1;
    }

    int weights_size = weights->width * weights->height * sizeof(float);

    HANDLE_ERROR(cudaMalloc((void **)&weights_elements, weights_size));

    //Allocate objects on the device
    HANDLE_ERROR(cudaMalloc((void **)&dev_weights, sizeof(Matrix)));

    //Copy the data to the object allocated on the device
    HANDLE_ERROR(cudaMemcpy(dev_weights, weights, sizeof(Matrix), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(weights_elements, weights->data, weights_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(&(dev_weights->data), &weights_elements, sizeof(float*), cudaMemcpyHostToDevice));

    add_and_display <<< weights->width, weights->height >>> (dev_weights);

    HANDLE_ERROR(cudaDeviceSynchronize());

    //Copy back data from device
    float* hostPointer = new float[weights->width * weights->height];
    HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));

    //Display and get errors here
    cout << weights << endl;

    cudaFree(dev_weights);

    return 0;
}

这是我的错误检查宏：

static void HandleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

因此问题是当您尝试在主机上显示数据时程序崩溃。我想从设备返回的副本不起作用，但我找不到纠正它的方法。

谢谢你能帮助我找到问题。

编辑1：简化了我的帖子，所以每个人都可以测试它。

Answer 1

在设备上分配，传输和使用对象实际上没有任何问题。在MCVE结束时从设备传回数据有一个小错误，它是段错误的来源。这样：

//Copy back data from device
float* hostPointer = new float[weights->width * weights->height];
HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));

向weights留下一个指向基础权重数据的设备指针，当您尝试在主机上输出矩阵内容时会导致段错误。代码需要更改为：

//Copy back data from device
float* hostPointer = new float[weights->width * weights->height];
HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));
weights->data = hostPointer; // weights data must point to hostPointer

然后代码正确运行：

$ cat weights.cu
#include <iostream>
#include <cstdio>

static void HandleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
class Matrix
{
public:
    float* data;
    int width;
    int height;

    Matrix();
    Matrix(const Matrix&);
    ~Matrix();
    void reset();
    friend std::ostream& operator<<(std::ostream&, const Matrix*);
};

Matrix::Matrix()
{
}

Matrix::Matrix(const Matrix& copy) : width(copy.width), height(copy.height)
{
    data = new float[width * height];
    std::copy(copy.data, copy.data + width * height, data);
}

Matrix::~Matrix()
{
    delete data;
}

void Matrix::reset()
{
    memset(data, 0, width * height * sizeof(float));
}

std::ostream& operator<<(std::ostream& out, const Matrix* matrix)
{   
    for (int i = 1; i <= matrix->height * matrix->width; ++i)
        out << matrix->data[i - 1] << (i % matrix->width == 0 ? "\n" : "\t");
    return out;
}

__global__ void add_and_display(Matrix* dev_weights)
{
    dev_weights->data[blockIdx.x * dev_weights->width + threadIdx.x] += 1.f;
}

int main()
{
    Matrix* weights = new Matrix(), *dev_weights;
    float* weights_elements;

    //For the purpose of testing, creating a checked pattern Matrix
    weights->width = 9;
    weights->height = 9;
    weights->data = new float[weights->width * weights->height];
    for (int i = 0; i < weights->width * weights->height; ++i)
    {
        if (i % 2 == 0) 
            weights->data[i] = 0;
        else 
            weights->data[i] = 1;
    }

    int weights_size = weights->width * weights->height * sizeof(float);

    HANDLE_ERROR(cudaMalloc((void **)&weights_elements, weights_size));

    //Allocate objects on the device
    HANDLE_ERROR(cudaMalloc((void **)&dev_weights, sizeof(Matrix)));

    //Copy the data to the object allocated on the device
    HANDLE_ERROR(cudaMemcpy(dev_weights, weights, sizeof(Matrix), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(weights_elements, weights->data, weights_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(&(dev_weights->data), &weights_elements, sizeof(float*), cudaMemcpyHostToDevice));

    add_and_display <<< weights->width, weights->height >>> (dev_weights);

    HANDLE_ERROR(cudaDeviceSynchronize());

    //Copy back data from device
    float* hostPointer = new float[weights->width * weights->height];
    HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));
    weights->data = hostPointer;

    //Display and get errors here
    std::cout << weights << std::endl;

    cudaFree(dev_weights);

    return 0;
}

$ nvcc -g -arch=sm_52 -o weights weights.cu 

$ cuda-memcheck weights
========= CUDA-MEMCHECK
1   2   1   2   1   2   1   2   1
2   1   2   1   2   1   2   1   2
1   2   1   2   1   2   1   2   1
2   1   2   1   2   1   2   1   2
1   2   1   2   1   2   1   2   1
2   1   2   1   2   1   2   1   2
1   2   1   2   1   2   1   2   1
2   1   2   1   2   1   2   1   2
1   2   1   2   1   2   1   2   1

========= ERROR SUMMARY: 0 errors

在Cuda中分配对象的麻烦

1 个答案: