cublasStrsmBatched - execution failed

Date: 2013-11-09 08:14:39

Tags: c cuda gpgpu hpc cublas

I cannot run cublasStrsmBatched (line 113) without it returning CUBLAS_STATUS_EXECUTION_FAILED (13). To keep things simple, all matrix values and alpha are 1.0, all matrices are square, and lda, ldb, m and n are equal. I am able to run cublasSgemmBatched and cublasStrsm in the same way with no errors. cublasStrsmBatched should behave the same, but it does not, at least not for me. Please tell me if you have any idea what I am doing wrong in this code:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

cublasHandle_t handle;

void CheckCublasCreate(cublasStatus_t status);
void CheckAllocateHost(void* h_pointer);
void CheckCudaMalloc(cudaError_t d_allocStatus);
void CheckCudaMemcpy( cudaError_t error );
void CheckCublasSetGetMatrix(cublasStatus_t status);
void CheckKernelExecution(cublasStatus_t status);
void CheckCublasDestroy(cublasStatus_t status);

void TestCublasStrsmBatched(int size, int numOfLinSys);

int main()
{
    cublasStatus_t status = cublasCreate(&handle);
    CheckCublasCreate(status);

    /*arguments are size of square matrix 
    and number of linear systems*/
    TestCublasStrsmBatched(2,2);

    status = cublasDestroy(handle);
    CheckCublasDestroy(status);
}

void TestCublasStrsmBatched(int size, int numOfLinSys)
{
    cublasStatus_t status;
    cudaError_t error;
    float **h_A;
    float **d_A;
    float **h_B;
    float **d_B;
    float **hd_A;
    float **hd_B;
    float *alpha;

    const int n = size;
    const int m = size;
    const int lda=m;
    const int ldb=m;
    const int matA_numOfElem = m*m;
    const int matB_numOfElem = m*n;

    int i,j;

    h_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_A);

    h_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_B);

    alpha=(float *)malloc(sizeof(float));
    *alpha = 1.0;

    for (j=0; j<numOfLinSys; j++){
        h_A[j] = (float *)malloc(matA_numOfElem * sizeof(float));
        CheckAllocateHost(h_A[j]);
        for (i=0; i < matA_numOfElem; i++) 
            h_A[j][i] = 1.0;

        h_B[j] = (float *)malloc(matB_numOfElem * sizeof(float));
        CheckAllocateHost(h_B[j]);
        for (i=0; i < matB_numOfElem; i++)
            h_B[j][i] = 1.0;
        }

    hd_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_A);

    hd_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_B);

    for (j=0; j<numOfLinSys; j++){
        error = cudaMalloc((void **)&hd_A[j], 
                           matA_numOfElem * sizeof(float));
        CheckCudaMalloc(error);

        error = cudaMalloc((void **)&hd_B[j], 
                           matB_numOfElem * sizeof(float));
        CheckCudaMalloc(error);

        status = cublasSetMatrix(m, m, sizeof(float), 
                                 h_A[j], lda, hd_A[j], lda);
        CheckCublasSetGetMatrix(status);

        status = cublasSetMatrix(m, n, sizeof(float), 
                                 h_B[j], ldb, hd_B[j], ldb);
        CheckCublasSetGetMatrix(status);
        }

    error = cudaMalloc((void **)&d_A, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);

    error = cudaMalloc((void **)&d_B, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);

    error = cudaMemcpy(d_A, hd_A, numOfLinSys * sizeof(float*), 
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);

    error = cudaMemcpy(d_B, hd_B, numOfLinSys * sizeof(float*), 
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);

    /*After cublasStrsmBatched call 
    status changes to CUBLAS_STATUS_EXECUTION_FAILED (13)*/
    status = cublasStrsmBatched(handle,
                                CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                                m, n, alpha, d_A, lda, d_B, ldb, numOfLinSys);
    CheckKernelExecution(status);
}


void CheckCublasCreate( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, 
                "!!!! CUBLAS initialization error \n");
        exit(EXIT_FAILURE);
        }
}

void CheckAllocateHost( void* h_pointer )
{
    if (h_pointer == 0){
        fprintf(stderr, 
                "!!!! host memory allocation error \n");
        exit(EXIT_FAILURE);
        }
}

void CheckCudaMalloc( cudaError_t error )
{
    if (error != cudaSuccess){
        fprintf(stderr, 
                "!!!! device memory allocation error (error code %s)\n", 
                cudaGetErrorString(error));
        exit(EXIT_FAILURE);
        }
}

void CheckCudaMemcpy( cudaError_t error )
{
    if (error != cudaSuccess){
        fprintf(stderr, "!!!! data copy error (error code %s)\n", 
                cudaGetErrorString(error));
        exit(EXIT_FAILURE);
        }
}

void CheckCublasSetGetMatrix( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! device access error \n");
        exit(EXIT_FAILURE);
        }
}

void CheckKernelExecution( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! kernel execution error.\n");
        exit(EXIT_FAILURE);
        }
}

void CheckCublasDestroy( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! shutdown error \n");
        exit(EXIT_FAILURE);
        }
}
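
As an aside, the checkers above only print generic messages. A small hand-rolled helper that names the cublasStatus_t value makes failures like this one easier to report; a sketch only (hand-rolled because, to my knowledge, this cuBLAS version exposes no status-to-string function):

/* Sketch: map cublasStatus_t values to readable names, e.g. for use
   in CheckKernelExecution instead of the generic message. */
static const char* CublasStatusName(cublasStatus_t status)
{
    switch (status) {
        case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
        default:                             return "unknown cublasStatus_t value";
    }
}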


Using Linux with CUDA 5.5 and a T10, and Windows with CUDA 5.5 and a GTX 285.

Thank you!

1 Answer:

Answer (score: 2):

Batched triangular backsolvers were something I had not tried in CUBLAS before, so I was interested to take a look and see what might be going on. Your code is rather convoluted, so I did not bother trying to understand it, but when I ran it, it appeared to be failing with an internal CUBLAS launch failure:

$ cuda-memcheck ./a.out
========= CUDA-MEMCHECK
!!!! kernel execution error.
========= Program hit error 8 on CUDA API call to cudaLaunch 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/Library/Frameworks/CUDA.framework/Versions/A/Libraries/libcuda_256.00.35.dylib (cudbgGetAPIVersion + 0x27bd7) [0x4538e7]
=========     Host Frame:/usr/local/cuda/lib/libcudart.dylib (cudaLaunch + 0x26c) [0x45c8c]
=========     Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasZgetrfBatched + 0x1e34) [0x196ae4]
=========     Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0x64d) [0x1974cd]
=========     Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0xacb) [0x19794b]
=========     Host Frame:/Users/talonmies/./a.out (_Z22TestCublasStrsmBatchedii + 0x3c1) [0x1b28]
=========     Host Frame:/Users/talonmies/./a.out (main + 0x3d) [0x1b7d]
=========     Host Frame:/Users/talonmies/./a.out (start + 0x35) [0x14e9]
=========     Host Frame:[0x1]

(This is an OS X machine with a compute 1.2 GPU and CUDA 5.0.) Error 8 is cudaErrorInvalidDeviceFunction, which usually only surfaces when the library or fatbinary does not contain an architecture that matches, or can be JIT recompiled into, something the GPU can run.
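
One quick way to probe that theory is to query the device's compute capability at runtime and compare it against what the library ships. A minimal sketch using the standard runtime API (the sm_20 cutoff is an assumption here, confirmed only by the cuobjdump inspection further down):

#include <stdio.h>
#include <cuda_runtime.h>

/* Print the compute capability of device 0 so it can be compared
   against the architectures actually embedded in the library. */
int main(void)
{
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n",
                cudaGetErrorString(err));
        return 1;
    }
    printf("Device 0: %s, compute capability %d.%d\n",
           prop.name, prop.major, prop.minor);
    /* Assumption: the batched routines need at least sm_20. */
    if (prop.major < 2)
        printf("compute 1.x device: batched TRSM may have no matching cubin\n");
    return 0;
}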

Curious, I wrote my own much simpler repro case from scratch:

#include <iostream>
#include <cublas_v2.h>

int main(void)
{
    const int Neq = 5, Nrhs = 2, Nsys = 4;

    float Atri[Neq][Neq] = 
        { { 1,  6, 11, 16, 21},
        { 0,  7, 12, 17, 22},
        { 0,  0, 13, 18, 23},
        { 0,  0,  0, 19, 24},
        { 0,  0,  0,  0, 25} };

    float B[Nrhs][Neq] = 
        { {  1,  27, 112, 290, 595},
        {  2,  40, 148, 360, 710} };


    float *syslhs[Nsys], *sysrhs[Nsys];
    float *A_, *B_, **syslhs_, **sysrhs_;

    size_t Asz = sizeof(float) * (size_t)(Neq * Neq);
    size_t Bsz = sizeof(float) * (size_t)(Neq * Nrhs);

    cudaMalloc((void **)(&A_), Asz);
    cudaMalloc((void **)(&B_), Bsz * size_t(Nsys));

    cudaMemcpy(A_, Atri, Asz, cudaMemcpyHostToDevice);
    for(int i=0; i<Nsys; i++) {
        syslhs[i] = A_;
        sysrhs[i] = (float*)((char *)B_ + i*Bsz);
        cudaMemcpy(sysrhs[i], B, Bsz, cudaMemcpyHostToDevice);
    }

    size_t syssz = sizeof(float *) * (size_t)Nsys;
    cudaMalloc((void **)&syslhs_, syssz);
    cudaMalloc((void **)&sysrhs_, syssz);
    cudaMemcpy(syslhs_, syslhs, syssz, cudaMemcpyHostToDevice);
    cudaMemcpy(sysrhs_, sysrhs, syssz, cudaMemcpyHostToDevice);

    const cublasSideMode_t side = CUBLAS_SIDE_LEFT;
    const cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT;
    const cublasFillMode_t ulo = CUBLAS_FILL_MODE_LOWER;
    const cublasOperation_t trans = CUBLAS_OP_N;
    float alpha = 1.f;

    cublasHandle_t handle;
    cublasCreate(&handle);

    cublasStrsmBatched(
                handle,
                side, ulo, trans, diag,
                Neq, Nrhs,
                &alpha, 
                syslhs_, Neq,
                sysrhs_, Neq,
                Nsys
                );


    for(int k=0; k<Nsys; k++) {
        cudaMemcpy(B, sysrhs[k], Bsz, cudaMemcpyDeviceToHost);
        for(int i=0; i<Nrhs; i++) {
            for(int j=0; j<Neq; j++) {
                std::cout << B[i][j] << ",";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }

    return 0;
} 

This also fails in exactly the same way as your code. On first inspection this really does look like an internal CUBLAS problem, although it is very hard to say what. About the only thing I can think of is that these solvers are only supported on compute capability 3.5 devices and are not supported on compute 1.x devices, but the documentation fails to mention it. Between us we have tested compute 1.2, compute 1.3 and compute 3.0 [my mistake, I read K10 rather than T10 in your question] devices, so there is not much else left...
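
If that restriction turns out to be real, one possible workaround, given that you report plain cublasStrsm works on these devices, is to emulate the batched call with a loop over the non-batched routine. A sketch only, written against the variables of my repro case above and meant to replace the cublasStrsmBatched call (it will be slower for many small systems, since each solve is a separate launch):

    /* Hypothetical fallback: solve the Nsys systems one at a time with
       the non-batched cublasStrsm. syslhs and sysrhs are the host-side
       arrays of device pointers built earlier in the repro case. */
    for (int k = 0; k < Nsys; k++) {
        cublasStatus_t st = cublasStrsm(handle,
                                        side, ulo, trans, diag,
                                        Neq, Nrhs,
                                        &alpha,
                                        syslhs[k], Neq,
                                        sysrhs[k], Neq);
        if (st != CUBLAS_STATUS_SUCCESS) {
            std::cerr << "cublasStrsm failed on system " << k << std::endl;
            break;
        }
    }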

All I can suggest is trying to run the code with cuda-memcheck and seeing whether it reports the same error. If it does, I would see a bug report to NVIDIA in your future.
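
You can also surface the underlying launch failure in-process, without cuda-memcheck, by synchronizing and reading back the runtime's error state straight after the batched call. A minimal sketch, assuming it is placed immediately after cublasStrsmBatched returns status 13:

    /* Sketch: force completion and fetch the CUDA runtime's error state;
       a launch failure inside the library should show up here even though
       cuBLAS itself only reported CUBLAS_STATUS_EXECUTION_FAILED. */
    cudaError_t launchErr = cudaDeviceSynchronize();
    if (launchErr == cudaSuccess)
        launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
        fprintf(stderr, "underlying CUDA error: %s\n",
                cudaGetErrorString(launchErr));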


EDIT: In blatant disregard of the EULA, I used cuobjdump to explore the cubin payloads in the CUDA 5 cublas library. For the single precision batched trsm routines I found cubins for:
  • 32 bit sm_20
  • 32 bit sm_30
  • 32 bit sm_35
  • 64 bit sm_20
  • 64 bit sm_30
  • 64 bit sm_35

There are clearly no sm_1x cubins in the library, so my compute_12 device should produce the runtime library error I am seeing. It also explains the failures on the GTX 285 and the Tesla T10, which are both compute_13 devices.


EDIT2:

As suspected, my repro code runs perfectly on a Linux system with a compute_30 device, under both the CUDA 5.0 and CUDA 5.5 release libraries.