当尝试使用cublasSgemm
例程执行张量矩阵产品时,会发生地址超出范围错误,下面提供了一个示例: -
========= Invalid __global__ read of size 4
========= at 0x000019f8 in sgemm_sm35_ldg_nn_64x16x64x16x16
========= by thread (6,3,0) in block (6,3,0)
========= Address 0x7ffc059064a8 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15859d]
========= Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x21fb31]
========= Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x23a343]
========= Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x1d4e92]
========= Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x1d17b4]
========= Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x1d2c5e]
========= Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0x1d37b2]
========= Host Frame:/usr/local/cuda-7.5/lib64/libcublas.so.7.5 [0xecd31]
========= Host Frame:./test [0x2c0e]
========= Host Frame:./test [0x2a99]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21af5]
========= Host Frame:./test [0x2749]
在我的应用程序中多次检查维度并确定这不是问题之后,我写了一个最小的工作示例。下面是一个简单的例子,它将两个平方矩阵相乘: -
#include "stdlib.h"
#include "time.h"
#include "stdio.h"
#include "cuda.h"
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <math.h>
#include "cuda_error.h"
void matrixMult(cublasOperation_t transA, cublasOperation_t transB, int M, int N,
int K, float alpha, float *A, float *B, float beta, float *C,
cublasHandle_t *cb_handle);
int main(){
int i, j, idx;
int D = 500;
int len = D*D;
float *A_h, *B_h, *C_h;
float *A_d, *B_d, *C_d;
A_h = (float*)malloc(len*sizeof(float));
B_h = (float*)malloc(len*sizeof(float));
C_h = (float*)malloc(len*sizeof(float));
srand48(time(NULL));
for(i=0; i<D; i++){
for(j=0; j<D; j++){
A_h[i*D + j] = drand48();
B_h[i*D + j] = drand48();
}
}
cudaCheck(cudaMalloc((void**)&A_d, len*sizeof(float)));
cudaCheck(cudaMalloc((void**)&B_d, len*sizeof(float)));
cudaCheck(cudaMalloc((void**)&C_d, len*sizeof(float)));
cudaCheck(cudaMemcpy(A_d, A_h, len*sizeof(float), cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(B_d, B_h, len*sizeof(float), cudaMemcpyHostToDevice));
cublasHandle_t cb_handle;
cublasCheck(cublasCreate(&cb_handle));
cublasSetPointerMode(cb_handle, CUBLAS_POINTER_MODE_DEVICE);
matrixMult(CUBLAS_OP_N, CUBLAS_OP_N, D, D, D, 1.0, B_d, A_d, 0.0, C_d, &cb_handle);
cublasDestroy(cb_handle);
cudaCheck(cudaMemcpy(C_h, C_d, len*sizeof(float), cudaMemcpyDeviceToHost));
cudaCheck(cudaFree(A_d));
cudaCheck(cudaFree(B_d));
cudaCheck(cudaFree(C_d));
free(A_h);
free(B_h);
free(C_h);
}
void matrixMult(cublasOperation_t transA, cublasOperation_t transB, int M, int N,
int K, float alpha, float *A, float *B, float beta, float *C,
cublasHandle_t *cb_handle){
int lda = (transA == CUBLAS_OP_N) ? K : M;
int ldb = (transB == CUBLAS_OP_N) ? N : K;
int ldc = N;
cublasCheck(cublasSgemm(*cb_handle, transB, transA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc));
}
使用以下简单的错误捕获标题: -
#ifndef CUDA_ERROR_CHECK
#define CUDA_ERROR_CHECK
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define cudaCheck(ans){cuda_assert((ans), __FILE__, __LINE__);}
#define cublasCheck(ans){cublas_assert((ans), __FILE__, __LINE__);}
inline void cuda_assert(cudaError_t code, const char *file, int line){
if(code != cudaSuccess){
fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(code);
}
}
inline void cublas_assert(cublasStatus_t code, const char *file, int line){
if(code != CUBLAS_STATUS_SUCCESS){
fprintf(stderr, "CUBLAS Error! %s line: %d error code: %d\n", file, line, code);
exit(code);
}
}
#endif
请注意,上述误差输出由上述方阵示例产生。我的张量积应用得到了类似的输出。
我正在使用带有Titan Black卡的CUDA 7.5。我做了一些根本错误的事情,还是可能是我的cuBLAS安装问题?
答案 0 :(得分:1)
如果你消除了这个:
cublasSetPointerMode(cb_handle, CUBLAS_POINTER_MODE_DEVICE);
您的代码将正常运行。不清楚为什么要将指针模式设置为CUBLAS_POINTER_MODE_DEVICE
。 documentation表示:
有两类使用标量参数的函数:
通过主机或设备上的引用获取alpha和/或beta参数作为缩放因子的函数,例如gemm
在主机或设备上返回标量结果的函数,如amax(),amin,asum(),rotg(),rotmg(),dot()和nrm2()。
对于第一类的函数,当指针模式设置为
CUBLAS_POINTER_MODE_HOST
时,标量参数alpha和/或beta可以在堆栈上或在堆上分配。
CUBLAS_POINTER_MODE_HOST
是默认设置,在您的情况下,它是正确的设置,其中&alpha
和&beta
是指向主机内存的指针:< / p>
cublasCheck(cublasSgemm(*cb_handle, transB, transA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc));