将阵列从RAM复制到GPU以及从GPU复制到RAM

时间:2012-04-16 16:33:54

标签: c++ cuda gpgpu

我正在尝试在我的一个项目中引入一些CUDA优化。但我想我在这里做错了什么。我想实现一个简单的矩阵向量乘法(result = matrix * vector)。但是当我想将结果复制回主机时,会发生错误(cudaErrorLaunchFailure)。我的内核(matrixVectorMultiplicationKernel)中是否有错误,或者我是否错误地调用了cudaMemcpy?我没有找到有关此类错误状态的有用文档。我认为这完全破坏了GPU的状态,因为我不能在第一次出现之后再调用任何CUDA内核而不会再次出现此错误。

编辑#1:根据左撇子的建议更新了代码。

// code
...
Eigen::MatrixXf matrix(M, N); // matrix.data() usually should return a float array
Eigen::VectorXf vector(N);    // same here for vector.data()
Eigen::VectorXf result(M);
... // fill matrix and vector
float* matrixOnDevice = copyMatrixToDevice(matrix.data(), matrix.rows(), matrix.cols());
matrixVectorMultiplication(matrixOnDevice, vector.data(), result.data(), matrix.rows(), cm.cols());
... // clean up

// helper functions
float* copyMatrixToDevice(const float* matrix, int mRows, int mCols)
{
  float* matrixOnDevice;
  const int length = mRows*mCols;
  const int size = length * sizeof(float);
  handleCUDAError(cudaMalloc((void**)&matrixOnDevice, size));
  handleCUDAError(cudaMemcpy(matrixOnDevice, matrix, size, cudaMemcpyHostToDevice));
  return matrixOnDevice;
}

void matrixVectorMultiplication(const float* matrixOnDevice, const float* vector, float* result, int mRows, int mCols)
{
  const int vectorSize = mCols*sizeof(float);
  const int resultSize = mRows*sizeof(float);
  const int matrixLength = mRows*mCols;
  float* deviceVector;
  float* deviceResult;
  handleCUDAError(cudaMalloc((void**)&deviceVector, vectorSize));
  handleCUDAError(cudaMalloc((void**)&deviceResult, resultSize));
  handleCUDAError(cudaMemset(deviceResult, 0, resultSize));
  handleCUDAError(cudaMemcpy(deviceVector, vector, vectorSize, cudaMemcpyHostToDevice));
  int threadsPerBlock = 256;
  int blocksPerGrid = (mRows + threadsPerBlock - 1) / threadsPerBlock;
  matrixVectorMultiplicationKernel<<<blocksPerGrid, threadsPerBlock>>>(matrixOnDevice, vector, result, mRows, mCols, matrixLength);
  // --- no errors yet ---
  handleCUDAError(cudaMemcpy(result, deviceResult, resultSize, cudaMemcpyDeviceToHost)); // cudaErrorLaunchFailure
  handleCUDAError(cudaFree(deviceVector)); // cudaErrorLaunchFailure
  handleCUDAError(cudaFree(deviceResult)); // cudaErrorLaunchFailure
}

__global__ void matrixVectorMultiplicationKernel(const float* matrix, const float* vector, float* result, int mRows, int mCols, int length)
{
  int row = blockDim.x * blockIdx.x + threadIdx.x;
  if(row < mRows)
  {
    for(int col = 0, mIdx = row*mCols; col < mCols; col++, mIdx++)
      result[row] += matrix[mIdx] * vector[col];
  }
}

1 个答案:

答案 0 :(得分:3)

你的问题是void copyMatrixToDevice(..., float* matrixOnDevice, ...)通过值获取此指针,即它不能“输出”设备矩阵。您可以使用{/ 1}调用

来执行此操作
void copyMatrixToDevice(..., float** matrixOnDevice, ...)

copyMatrixToDevice(matrix.data(), &matrixOnDevice, matrix.rows(), matrix.cols()); 中的result存在同样的问题。

从长远来看,在C ++中你应该为所有这些设置适当的类抽象层。