我目前正在用 CUDA C++ 实现一些深度学习的代码(算法已在纸上推导好),但我被 CUDA 的一些奇怪行为卡住了。
这是我的类:
// A simple dense matrix stored as one flat float buffer.
// Layout is row-major with `width` columns: element (row, col) lives at
// data[row * width + col] (this matches the indexing in the kernel and
// the line-breaking in operator<< below).
class Matrix
{
public:
float* data;   // element buffer of width * height floats
int width;     // number of columns
int height;    // number of rows
Matrix();                      // default constructor (see definition)
Matrix(const Matrix&);         // deep-copies the element buffer
~Matrix();                     // releases the element buffer
void reset();                  // zero-fills the element buffer
friend std::ostream& operator<<(std::ostream&, const Matrix*);
};
其定义:
// Default constructor: initialize to a safe empty state.
// The original left data/width/height indeterminate, so calling reset()
// or the destructor on a default-constructed Matrix was undefined behavior.
Matrix::Matrix()
    : data(0), width(0), height(0)
{
}
// Copy constructor: deep copy — allocate a fresh buffer and duplicate
// every element of the source matrix.
Matrix::Matrix(const Matrix& copy) : width(copy.width), height(copy.height)
{
    const int count = width * height;
    data = new float[count];
    for (int i = 0; i < count; ++i)
        data[i] = copy.data[i];
}
// Destructor: release the element buffer.
// data is allocated with new[] (see the copy constructor and main), so it
// must be freed with delete[]; the original used plain delete, which is
// undefined behavior for array allocations.
Matrix::~Matrix()
{
    delete[] data;
}
// Zero every element. memset with 0 is byte-safe here because an
// all-zero bit pattern is float 0.0f.
void Matrix::reset()
{
    const int count = width * height;
    memset(data, 0, count * sizeof(float));
}
// Stream the matrix row by row: a tab separates columns, a newline ends
// each row (emitted after every `width`-th element).
std::ostream& operator<<(std::ostream& out, const Matrix* matrix)
{
    const int total = matrix->width * matrix->height;
    for (int idx = 0; idx < total; ++idx)
    {
        const bool endOfRow = ((idx + 1) % matrix->width == 0);
        out << matrix->data[idx] << (endOfRow ? "\n" : "\t");
    }
    return out;
}
这是一个最小,完整和可验证的例子:
// Adds 1 to every element. Launch layout expected by this kernel:
// gridDim.x == matrix height (one block per row),
// blockDim.x == matrix width  (one thread per column).
// No bounds check — the launch in main sizes the grid to match exactly.
__global__ void add_and_display(Matrix* dev_weights)
{
    const int element = blockIdx.x * dev_weights->width + threadIdx.x;
    dev_weights->data[element] += 1.f;
}
int main()
{
    Matrix *weights = new Matrix(), *dev_weights;
    float *weights_elements;

    // For the purpose of testing, create a checkerboard-pattern matrix.
    weights->width = 9;
    weights->height = 9;
    weights->data = new float[weights->width * weights->height];
    for (int i = 0; i < weights->width * weights->height; ++i)
        weights->data[i] = (i % 2 == 0) ? 0.f : 1.f;

    int weights_size = weights->width * weights->height * sizeof(float);
    HANDLE_ERROR(cudaMalloc((void **)&weights_elements, weights_size));
    // Allocate the Matrix object itself on the device.
    HANDLE_ERROR(cudaMalloc((void **)&dev_weights, sizeof(Matrix)));
    // Copy the object (for width/height), then the element buffer, then patch
    // the device object's data member so it points at the device buffer.
    HANDLE_ERROR(cudaMemcpy(dev_weights, weights, sizeof(Matrix), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(weights_elements, weights->data, weights_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(&(dev_weights->data), &weights_elements, sizeof(float*), cudaMemcpyHostToDevice));

    add_and_display <<< weights->width, weights->height >>> (dev_weights);
    HANDLE_ERROR(cudaGetLastError());       // catch launch-configuration errors
    HANDLE_ERROR(cudaDeviceSynchronize());  // catch asynchronous execution errors

    // Copy back from the device. The struct copy overwrites weights->data with
    // the DEVICE pointer, so: keep the old host buffer to free it, fetch the
    // elements into a fresh host buffer, and repoint weights->data at that
    // buffer. Without the repointing, printing below dereferences device
    // memory on the host and crashes — this was the original bug.
    float *original_host_data = weights->data;
    float *hostPointer = new float[weights->width * weights->height];
    HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));
    weights->data = hostPointer;
    delete[] original_host_data;

    // Display and get errors here
    cout << weights << endl;

    // Release device and host resources (weights_elements was leaked before).
    cudaFree(dev_weights);
    cudaFree(weights_elements);
    delete[] hostPointer;
    weights->data = 0;   // detach so ~Matrix does not touch the freed buffer
    delete weights;
    return 0;
}
这是我的错误检查宏:
// Abort with a diagnostic (error string, file, line) when a CUDA runtime
// call returns anything other than cudaSuccess.
static void HandleError(cudaError_t err, const char *file, int line) {
    if (err == cudaSuccess)
        return;
    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}
// Wraps a CUDA call so failures report the call site, not this helper.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
因此问题是:当程序尝试在主机上显示数据时发生崩溃。我猜是从设备拷贝回主机这一步没有起作用,但我找不到纠正它的方法。
谢谢你能帮助我找到问题。
编辑1:简化了我的帖子,所以每个人都可以测试它。
答案 0(得分:0):
在设备上分配,传输和使用对象实际上没有任何问题。在MCVE结束时从设备传回数据有一个小错误,它是段错误的来源。这样:
//Copy back data from device
float* hostPointer = new float[weights->width * weights->height];
HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));
会使 weights->data 仍然保存着指向设备端权重数据的设备指针,当您尝试在主机上输出矩阵内容时就会导致段错误。代码需要更改为:
//Copy back data from device
float* hostPointer = new float[weights->width * weights->height];
HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));
weights->data = hostPointer; // weights data must point to hostPointer
然后代码正确运行:
$ cat weights.cu
#include <iostream>
#include <cstdio>
// Print the CUDA error string with its source location and terminate;
// a no-op when the call succeeded.
static void HandleError(cudaError_t err, const char *file, int line) {
    if (err == cudaSuccess) {
        return;
    }
    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}
// Injects the caller's file/line into the error report.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
// Dense matrix backed by a single flat float buffer.
// Row-major layout with `width` columns: element (row, col) is
// data[row * width + col], matching the kernel indexing and the
// per-`width` line breaks in operator<<.
class Matrix
{
public:
float* data;   // element buffer of width * height floats
int width;     // number of columns
int height;    // number of rows
Matrix();                      // default constructor (see definition)
Matrix(const Matrix&);         // deep-copies the element buffer
~Matrix();                     // releases the element buffer
void reset();                  // zero-fills the element buffer
friend std::ostream& operator<<(std::ostream&, const Matrix*);
};
// Default constructor: start from a well-defined empty state instead of
// indeterminate members (the original empty body made any use of a
// default-constructed Matrix — including its destructor — undefined).
Matrix::Matrix()
    : data(0), width(0), height(0)
{
}
// Copy constructor: allocate an independent buffer and replicate the
// source elements one by one.
Matrix::Matrix(const Matrix& copy) : width(copy.width), height(copy.height)
{
    const int n = width * height;
    data = new float[n];
    for (int k = 0; k < n; ++k)
        data[k] = copy.data[k];
}
// Destructor: free the element buffer with delete[] to match the new[]
// allocation; the original's plain `delete data` is undefined behavior
// for an array allocation.
Matrix::~Matrix()
{
    delete[] data;
}
// Set all elements to zero; a memset of zero bytes yields float 0.0f.
void Matrix::reset()
{
    const int n = width * height;
    memset(data, 0, n * sizeof(float));
}
// Print row by row: tabs between columns, a newline after each full row
// (i.e. after every `width`-th element).
std::ostream& operator<<(std::ostream& out, const Matrix* matrix)
{
    const int n = matrix->width * matrix->height;
    for (int k = 0; k < n; ++k)
    {
        const char* sep = ((k + 1) % matrix->width == 0) ? "\n" : "\t";
        out << matrix->data[k] << sep;
    }
    return out;
}
// Increment every matrix element by 1. Expected launch configuration:
// one block per row (gridDim.x == height) and one thread per column
// (blockDim.x == width); the launch in main matches this exactly, so no
// bounds check is needed.
__global__ void add_and_display(Matrix* dev_weights)
{
    const int i = blockIdx.x * dev_weights->width + threadIdx.x;
    dev_weights->data[i] += 1.f;
}
int main()
{
    Matrix *weights = new Matrix(), *dev_weights;
    float *weights_elements;

    // For the purpose of testing, create a checkerboard-pattern matrix.
    weights->width = 9;
    weights->height = 9;
    weights->data = new float[weights->width * weights->height];
    for (int i = 0; i < weights->width * weights->height; ++i)
        weights->data[i] = (i % 2 == 0) ? 0.f : 1.f;

    int weights_size = weights->width * weights->height * sizeof(float);
    HANDLE_ERROR(cudaMalloc((void **)&weights_elements, weights_size));
    // Allocate the Matrix object itself on the device.
    HANDLE_ERROR(cudaMalloc((void **)&dev_weights, sizeof(Matrix)));
    // Copy the object (for width/height), then the elements, then patch the
    // device object's data member so it points at the device buffer.
    HANDLE_ERROR(cudaMemcpy(dev_weights, weights, sizeof(Matrix), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(weights_elements, weights->data, weights_size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(&(dev_weights->data), &weights_elements, sizeof(float*), cudaMemcpyHostToDevice));

    add_and_display <<< weights->width, weights->height >>> (dev_weights);
    HANDLE_ERROR(cudaGetLastError());       // catch launch-configuration errors
    HANDLE_ERROR(cudaDeviceSynchronize());  // catch asynchronous execution errors

    // Copy back from the device. The struct copy overwrites weights->data
    // with the DEVICE pointer, so keep the old host buffer to free it, fetch
    // the elements into a fresh host buffer, and repoint weights->data at it
    // so printing reads host memory.
    float *original_host_data = weights->data;
    float *hostPointer = new float[weights->width * weights->height];
    HANDLE_ERROR(cudaMemcpy(weights, dev_weights, sizeof(Matrix), cudaMemcpyDeviceToHost));
    HANDLE_ERROR(cudaMemcpy(hostPointer, weights->data, weights_size, cudaMemcpyDeviceToHost));
    weights->data = hostPointer; // weights data must point to hostPointer
    delete[] original_host_data;

    // Display and get errors here
    std::cout << weights << std::endl;

    // Release device and host resources (weights_elements was leaked before).
    cudaFree(dev_weights);
    cudaFree(weights_elements);
    delete[] hostPointer;
    weights->data = 0;   // detach so ~Matrix does not touch the freed buffer
    delete weights;
    return 0;
}
$ nvcc -g -arch=sm_52 -o weights weights.cu
$ cuda-memcheck weights
========= CUDA-MEMCHECK
1 2 1 2 1 2 1 2 1
2 1 2 1 2 1 2 1 2
1 2 1 2 1 2 1 2 1
2 1 2 1 2 1 2 1 2
1 2 1 2 1 2 1 2 1
2 1 2 1 2 1 2 1 2
1 2 1 2 1 2 1 2 1
2 1 2 1 2 1 2 1 2
1 2 1 2 1 2 1 2 1
========= ERROR SUMMARY: 0 errors