我正在尝试利用一台机器上的 GPU 来提高矩阵乘法运算的性能。
我尝试理解 this post 并使用 this repo 中的 CUDA 代码,并使用 R 的 devtools 包构建所有代码。
我所做的是写一个名为 matrixMultiplication.cu 的 CUDA 文件:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define BLOCK_SIZE 16
// Computes C = A * B for row-major matrices: A is m x n, B is n x k,
// C is m x k. Expected launch: a 2D grid of BLOCK_SIZE x BLOCK_SIZE blocks
// covering k columns (x dimension) and m rows (y dimension); threads outside
// the matrix bounds do no work.
__global__ void runGpuMatrixMult(double *a, double *b, double *c, int m, int n, int k)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// BUG FIX: the accumulator was `int`, which truncated every double partial
// product (and could overflow). It must match the element type.
double sum = 0.0;
if( col < k && row < m)
{
for(int i = 0; i < n; i++)
{
// dot product of row `row` of A with column `col` of B
sum += a[row * n + i] * b[i * k + col];
}
c[row * k + col] = sum;
}
}
extern "C"
// Host wrapper: multiplies the row-major m x n matrix whose first element is
// A by the n x k matrix whose first element is B, writing the m x k product
// into the buffer whose first element is C. The references refer to the
// first elements of contiguous arrays supplied by the Rcpp caller.
void gpuMatrixMult(double &A, double &B, double &C, int& m, int& n, int& k) {
// BUG FIX: all sizes used sizeof(int) for double data, so only part of
// each matrix was allocated/copied.
size_t bytesA = sizeof(double) * m * n;
size_t bytesB = sizeof(double) * n * k;
size_t bytesC = sizeof(double) * m * k;
// Allocate memory space on the device (BUG FIX: pointers were int*).
double *d_A, *d_B, *d_C;
cudaMalloc((void **) &d_A, bytesA);
cudaMalloc((void **) &d_B, bytesB);
cudaMalloc((void **) &d_C, bytesC);
// Copy the caller's matrices A and B to device memory.
// BUG FIX: the original copied from freshly allocated, *uninitialized*
// pinned buffers (h_A/h_B), so the kernel never saw the input data, and
// the result was never written back into C. The intermediate pinned
// buffers served no purpose and are removed.
cudaMemcpy(d_A, &A, bytesA, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, &B, bytesB, cudaMemcpyHostToDevice);
// Ceil-divide so partial tiles at the matrix edges are still covered.
unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 dimGrid(grid_cols, grid_rows);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
// Launch kernel, then surface launch/execution errors before copying back.
runGpuMatrixMult<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, m, n, k);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
fprintf(stderr, "runGpuMatrixMult launch failed: %s\n", cudaGetErrorString(err));
// cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize;
// it must run before (not after) reading results, though the blocking
// cudaMemcpy below would also synchronize.
cudaDeviceSynchronize();
// Transfer the result straight into the caller's output buffer.
cudaMemcpy(&C, d_C, bytesC, cudaMemcpyDeviceToHost);
// free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// BUG FIX: removed `return 0;` — returning a value from a void function
// is ill-formed.
}
然后是名为 matrixUtils.cpp 的 cpp 文件:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
using namespace Rcpp;
extern "C"
// Forward declaration of the CUDA host wrapper defined in
// matrixMultiplication.cu. BUG FIX: the declaration must match the .cu
// definition exactly — the original declared `double const&` parameters while
// the definition uses non-const `double&`, so the two did not describe the
// same function.
void gpuMatrixMult(double &A, double &B, double &C, int& m, int& n, int& k);
//' gpuMatrixMultCaller calls matrixMultiplication.cu::gpuMatrixMult
//'
//' @param A,B first elements of the input matrices' contiguous storage
//' @param C first element of the m x k output buffer, overwritten in place
//' @param m,n,k dimensions: A is m x n, B is n x k
//' @export
//[[Rcpp::export]]
SEXP gpuMatrixMultCaller(double const& A, double const& B, double& C, int m, int n, int k) {
// const_cast is safe here: gpuMatrixMult only reads A and B.
gpuMatrixMult(const_cast<double&>(A), const_cast<double&>(B), C, m, n, k);
return R_NilValue;
}
最后,我有一个名为 utils.R 的 R 文件,其中有一个调用 gpuMatrixMultCaller 的包装器 function:
#' gpuMatrixMultWrapper calls matrixUtils.cpp::gpuMatrixMultCaller which runs a GPU matrix multiplication
#' Returns the product of the input matrices
#'
#' @param A numeric matrix, m x n
#' @param B numeric matrix, n x k
#' @return the m x k matrix product A %*% B
gpuMatrixMultWrapper <- function(A,B)
{
  m <- nrow(A)
  n <- ncol(A)
  k <- ncol(B)
  # matrices must conform for multiplication
  stopifnot(n == nrow(B))
  # BUG FIX: the result buffer must be m x k. The original used
  # bigmemory::deepcopy(A), which is m x n -- the wrong shape whenever
  # k != n -- and pulled in an unnecessary bigmemory dependency.
  C <- matrix(0, nrow = m, ncol = k)
  gpuMatrixMultCaller(A, B, C, m, n, k)
  return(C)
}
当我运行 devtools::document 时,我收到此错误:
Error in dyn.load(dllfile) :
  unable to load shared object '/home/code/packages/utils/src/utils.so':
  /home/code/packages/utils/src/utils.so: undefined symbol: gpuMatrixMult
NAMESPACE 文件底部确实有 useDynLib(utils),并且我在 DESCRIPTION 文件中指定了 LinkingTo: Rcpp, RcppArmadillo。
所以我的问题是:
可以使用 devtools 构建包含 CUDA 代码的 R package 吗?如果不可以,CUDA 部分是否只能用 cpp 编写?
如果可以,我错过了什么?我尝试在 cpp 文件中添加 #include <cuda.h>,但得到了错误。
非常感谢