我有如下结构体:
/* A single point, stored as a pointer to its coordinate values.
 * The struct carries no length field, so the number of coordinates must be
 * tracked separately by whoever owns the buffer. */
typedef struct
{
/* Pointer to the coordinate array (floats). Copying a Point copies only this
 * pointer value, not the array it refers to. */
float* coordinates;
} Point;
以及下面这个 CUDA 函数:
/*
 * Host-side wrapper: copies one Point to the GPU, launches calculateCenter on
 * it, and copies the result back into *point.
 *
 * point             - host Point whose coordinates are read and overwritten.
 * NUM_OF_DIMENSIONS - number of floats transferred from/to point->coordinates.
 * NUM_OF_POINTS     - value forwarded unchanged to the kernel.
 *
 * Returns only the status of the final cudaDeviceReset() call. Earlier
 * failures are printed to stderr, but execution continues and their status
 * codes are overwritten by later calls.
 *
 * NOTE(review): this is the failing version of the function. See the notes at
 * the second cudaMalloc and at the copies that use point_dev->coordinates.
 */
cudaError_t calculateCenterUsingCuda(Point* point, const int NUM_OF_DIMENSIONS, const int NUM_OF_POINTS)
{
Point* point_dev;
cudaError_t cudaStatus;
// Select GPU 0 for all following runtime calls.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
// Allocate device storage for the Point struct itself; point_dev now holds a device address.
cudaStatus = cudaMalloc((void**)&point_dev, 1 * sizeof(Point));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
// NOTE(review): &point_dev->coordinates is an address inside device memory.
// cudaMalloc is a host API that stores the new pointer at the address it is
// given and expects that address to be in host memory, so this call is wrong.
cudaStatus = cudaMalloc((void**)&point_dev->coordinates, NUM_OF_DIMENSIONS * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
// Copies the host struct over the device struct, including the host value of
// point->coordinates (a host pointer).
cudaStatus = cudaMemcpy(point_dev, point, 1 * sizeof(Point), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
// NOTE(review): evaluating point_dev->coordinates here reads through a device
// pointer from host code.
cudaStatus = cudaMemcpy(point_dev->coordinates, point->coordinates, NUM_OF_DIMENSIONS * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
// One block of 52 threads; the thread count is hard-coded rather than taken
// from NUM_OF_DIMENSIONS.
calculateCenter<<<1, 52>>>(point_dev, NUM_OF_POINTS);
// Picks up launch-time errors.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "calculateCenterlaunch failed: %s\n", cudaGetErrorString(cudaStatus));
}
// Waits for the kernel to finish; errors raised while it ran are reported here.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching calculateCenter!\n", cudaStatus);
}
// Copy output vector from GPU buffer to host memory.
// This first copy overwrites the host struct, including point->coordinates,
// with whatever the device-side struct contains.
cudaStatus = cudaMemcpy(point, point_dev, 1 * sizeof(Point), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
// NOTE(review): the direction flag is cudaMemcpyHostToDevice although the
// destination is the host buffer, which contradicts the comment above.
// point_dev->coordinates is again read through a device pointer on the host.
cudaStatus = cudaMemcpy(point->coordinates, point_dev->coordinates, NUM_OF_DIMENSIONS * sizeof(float), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
// Resets the device; no cudaFree is issued for the allocations made above.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
}
return cudaStatus;
}
而 calculateCenter 核函数如下:
/*
 * Kernel: each thread divides one coordinate of *point by NUM_OF_POINTS, in place.
 *
 * Launch layout: the coordinate index is threadIdx.x alone (blockIdx is not
 * used), so the kernel is written for a single block with one thread per
 * coordinate. There is no bounds check; the caller must not launch more
 * threads than point->coordinates has elements.
 *
 * point         - Point whose coordinates pointer is dereferenced on the device.
 * NUM_OF_POINTS - divisor applied to every coordinate.
 *
 * Each thread also prints its index with device-side printf.
 */
__global__ void calculateCenter(Point* point, const int NUM_OF_POINTS)
{
    const int dim = threadIdx.x;
    float* const coords = point->coordinates;

    // Divide this thread's coordinate by the divisor, in place.
    coords[dim] = coords[dim] / NUM_OF_POINTS;

    printf("%d\n", dim);
}
基本上,我想用 CUDA 计算一个平均点:把该点的每个坐标除以点的数量(默认为 4),默认维数为 52。
但是当我运行此代码时出现错误:
cudaDeviceSynchronize returned error code 77 after launching calculateCenter!
为什么会出现这个错误?希望有人能帮忙解答。
谢谢!
答案 0 :(得分:1)
您的代码在指针管理方式上存在一些问题。下面这一行
cudaStatus = cudaMalloc((void**)&point_dev->coordinates, NUM_OF_DIMENSIONS * sizeof(float));
是错误的。在这一步,point_dev 是指向设备内存的指针,而主机端 API 函数 cudaMalloc 要求传入的目标地址位于主机内存空间中。在您的代码里,用来存放新设备指针的位置本身是一个设备地址,因此出错。
您可以使用一个位于主机端的中间 Point 结构体来解决此问题:先把设备指针存入其中,再把该结构体复制到设备:
/*
 * Divides every coordinate of a single host Point by NUM_OF_POINTS on the GPU
 * and writes the result back into point->coordinates.
 *
 * The Point struct contains a pointer, so it is deep-copied in two steps:
 *   1. the coordinate buffer is allocated on the device and its device address
 *      is stored in a host-side staging struct (point_for_dev);
 *   2. that staging struct is copied to the device so the kernel sees a Point
 *      whose `coordinates` member is a valid device pointer.
 * This way every address handed to a host API call lives in host memory.
 *
 * point             - host Point; point->coordinates must hold at least
 *                     NUM_OF_DIMENSIONS floats. Overwritten with the result.
 * NUM_OF_DIMENSIONS - number of coordinates; also used as the thread count of
 *                     the single block that is launched, so it must not exceed
 *                     the device's per-block thread limit (a violation is
 *                     reported as a launch failure).
 * NUM_OF_POINTS     - divisor forwarded to the kernel.
 *
 * Returns the first CUDA error encountered, or the status of the final
 * cudaDeviceReset() when everything before it succeeded. Returns
 * cudaErrorInvalidValue for a null point / null coordinates / non-positive
 * NUM_OF_DIMENSIONS without touching the device.
 */
cudaError_t calculateCenterUsingCuda(Point* point, const int NUM_OF_DIMENSIONS, const int NUM_OF_POINTS)
{
    // All locals are declared before the first goto so no jump crosses an
    // initialisation.
    Point* point_dev = NULL;   // device copy of the Point struct
    Point point_for_dev;       // host staging struct holding the device buffer pointer
    cudaError_t cudaStatus;
    cudaError_t resetStatus;
    size_t coordBytes = 0;

    point_for_dev.coordinates = NULL;

    if (point == NULL || point->coordinates == NULL || NUM_OF_DIMENSIONS <= 0) {
        fprintf(stderr, "calculateCenterUsingCuda: invalid argument!\n");
        return cudaErrorInvalidValue;
    }
    coordBytes = (size_t)NUM_OF_DIMENSIONS * sizeof(float);

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Cleanup;
    }

    // Device storage for the struct itself.
    cudaStatus = cudaMalloc((void**)&point_dev, 1 * sizeof(Point));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Cleanup;
    }

    // Device storage for the coordinates; the resulting device pointer is
    // written into the HOST staging struct, which is what cudaMalloc requires.
    cudaStatus = cudaMalloc((void**)&(point_for_dev.coordinates), coordBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Cleanup;
    }

    // Publish the staging struct (carrying the device pointer) to the device.
    cudaStatus = cudaMemcpy(point_dev, &point_for_dev, 1 * sizeof(Point), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Cleanup;
    }

    // Upload the coordinate values.
    cudaStatus = cudaMemcpy(point_for_dev.coordinates, point->coordinates, coordBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Cleanup;
    }

    // One block, one thread per coordinate.
    calculateCenter <<< 1, NUM_OF_DIMENSIONS >>>(point_dev, NUM_OF_POINTS);

    // Launch-configuration errors surface here.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "calculateCenterlaunch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Cleanup;
    }

    // Errors raised while the kernel executed surface here.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching calculateCenter!\n", cudaStatus);
        goto Cleanup;
    }

    // Copy output vector from GPU buffer to host memory.
    // The direction must be device-to-host: the destination is the host buffer.
    cudaStatus = cudaMemcpy(point->coordinates, point_for_dev.coordinates, coordBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Cleanup;
    }

Cleanup:
    // cudaFree(NULL) is a no-op, so this is safe on every exit path. Failures
    // while freeing are ignored so they cannot hide the primary error.
    cudaFree(point_for_dev.coordinates);
    cudaFree(point_dev);

    // Kept from the original: the device is reset before returning.
    resetStatus = cudaDeviceReset();
    if (resetStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
    }

    // Report the first failure; fall back to the reset status on success.
    return (cudaStatus != cudaSuccess) ? cudaStatus : resetStatus;
}
这样,传给主机端 API 函数的目标地址始终位于主机内存中。
对包含指针的结构体做深拷贝时,需要像这样进行特殊处理。