我正在尝试利用一台机器上的 GPU 来提高矩阵乘法运算的性能。
我尝试理解 this post 并使用 this repo 中的 CUDA 代码,并使用 R 的 devtools 包构建所有代码。
我所做的是写一个名为 matrixMultiplication.cu 的 CUDA 文件:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define BLOCK_SIZE 16
// Computes C = A * B for row-major matrices: A is m x n, B is n x k,
// C is m x k. Expected launch: a 2D grid of BLOCK_SIZE x BLOCK_SIZE blocks
// covering k columns (x dimension) and m rows (y dimension); threads outside
// the matrix bounds do no work.
__global__ void runGpuMatrixMult(double *a, double *b, double *c, int m, int n, int k)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// BUG FIX: the accumulator was `int`, which truncated every double partial
// product (and could overflow). It must match the element type.
double sum = 0.0;
if( col < k && row < m)
{
for(int i = 0; i < n; i++)
{
// dot product of row `row` of A with column `col` of B
sum += a[row * n + i] * b[i * k + col];
}
c[row * k + col] = sum;
}
}
extern "C"
// Host wrapper: multiplies the row-major m x n matrix whose first element is
// A by the n x k matrix whose first element is B, writing the m x k product
// into the buffer whose first element is C. The references refer to the
// first elements of contiguous arrays supplied by the Rcpp caller.
void gpuMatrixMult(double &A, double &B, double &C, int& m, int& n, int& k) {
// BUG FIX: all sizes used sizeof(int) for double data, so only part of
// each matrix was allocated/copied.
size_t bytesA = sizeof(double) * m * n;
size_t bytesB = sizeof(double) * n * k;
size_t bytesC = sizeof(double) * m * k;
// Allocate memory space on the device (BUG FIX: pointers were int*).
double *d_A, *d_B, *d_C;
cudaMalloc((void **) &d_A, bytesA);
cudaMalloc((void **) &d_B, bytesB);
cudaMalloc((void **) &d_C, bytesC);
// Copy the caller's matrices A and B to device memory.
// BUG FIX: the original copied from freshly allocated, *uninitialized*
// pinned buffers (h_A/h_B), so the kernel never saw the input data, and
// the result was never written back into C. The intermediate pinned
// buffers served no purpose and are removed.
cudaMemcpy(d_A, &A, bytesA, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, &B, bytesB, cudaMemcpyHostToDevice);
// Ceil-divide so partial tiles at the matrix edges are still covered.
unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 dimGrid(grid_cols, grid_rows);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
// Launch kernel, then surface launch/execution errors before copying back.
runGpuMatrixMult<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, m, n, k);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
fprintf(stderr, "runGpuMatrixMult launch failed: %s\n", cudaGetErrorString(err));
// cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize;
// it must run before (not after) reading results, though the blocking
// cudaMemcpy below would also synchronize.
cudaDeviceSynchronize();
// Transfer the result straight into the caller's output buffer.
cudaMemcpy(&C, d_C, bytesC, cudaMemcpyDeviceToHost);
// free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// BUG FIX: removed `return 0;` — returning a value from a void function
// is ill-formed.
}
然后是名为 matrixUtils.cpp 的 cpp 文件:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>
using namespace Rcpp;
extern "C"
// Forward declaration of the CUDA host wrapper defined in
// matrixMultiplication.cu. BUG FIX: the declaration must match the .cu
// definition exactly — the original declared `double const&` parameters while
// the definition uses non-const `double&`, so the two did not describe the
// same function.
void gpuMatrixMult(double &A, double &B, double &C, int& m, int& n, int& k);
//' gpuMatrixMultCaller calls matrixMultiplication.cu::gpuMatrixMult
//'
//' @param A,B first elements of the input matrices' contiguous storage
//' @param C first element of the m x k output buffer, overwritten in place
//' @param m,n,k dimensions: A is m x n, B is n x k
//' @export
//[[Rcpp::export]]
SEXP gpuMatrixMultCaller(double const& A, double const& B, double& C, int m, int n, int k) {
// const_cast is safe here: gpuMatrixMult only reads A and B.
gpuMatrixMult(const_cast<double&>(A), const_cast<double&>(B), C, m, n, k);
return R_NilValue;
}
最后,我有一个名为 utils.R 的 R 文件,其中有一个调用 gpuMatrixMultCaller 的包装器 function:
#' gpuMatrixMultWrapper calls matrixUtils.cpp::gpuMatrixMultCaller which runs a GPU matrix multiplication
#' Returns the product of the input matrices
#'
#' @param A numeric matrix, m x n
#' @param B numeric matrix, n x k
#' @return the m x k matrix product A %*% B
gpuMatrixMultWrapper <- function(A,B)
{
  m <- nrow(A)
  n <- ncol(A)
  k <- ncol(B)
  # matrices must conform for multiplication
  stopifnot(n == nrow(B))
  # BUG FIX: the result buffer must be m x k. The original used
  # bigmemory::deepcopy(A), which is m x n -- the wrong shape whenever
  # k != n -- and pulled in an unnecessary bigmemory dependency.
  C <- matrix(0, nrow = m, ncol = k)
  gpuMatrixMultCaller(A, B, C, m, n, k)
  return(C)
}
当我运行 devtools::document 时,我收到此错误:
Error in dyn.load(dllfile) :
  unable to load shared object '/home/code/packages/utils/src/utils.so':
  /home/code/packages/utils/src/utils.so: undefined symbol: gpuMatrixMult
NAMESPACE 文件底部确实有 useDynLib(utils),并且我在 DESCRIPTION 文件中指定了 LinkingTo: Rcpp, RcppArmadillo。
所以我的问题是:
可以使用 devtools 构建包含 CUDA 代码的 R package 吗?如果不可以,CUDA 部分是否只能用 cpp 编写?
如果可以,我错过了什么?我尝试在 cpp 文件中添加 #include <cuda.h>,但得到了错误。
非常感谢