mexcuda在.cu文件

时间:2016-06-09 14:52:45

标签: c++ matlab cuda mex

我在查找内存分配错误时遇到了一些麻烦。我目前在GeForce GT 630上使用Visual Studio 2013,Matlab 2015b和CUDA 7.0,我是GPU编程,CUDA和mex的新手。

当我用mexcuda编译并从Matlab调用我的代码时,一切正常,直到我在.cu文件中加入了使用colIndexStepSize的那一小段代码。加入之后,程序会正常运行到delete[]语句处;随后Matlab提示在此处触发了断点,然后崩溃。

当我删除有问题的代码行时,一切都会顺利运行。

我很确定我的内存处理有问题但我简直找不到这个bug。以下是制造麻烦的代码:

#include <cuda_runtime.h>
#include <cuda.h>
#include <cusparse.h>
#include <device_launch_parameters.h>
#include <curand.h>

#include <vector>

// Test macro: checks whether a CUDA runtime call on the GPU succeeded.
// Wrap any call returning cudaError_t; the call site's file and line are
// forwarded to gpuAssert.
#define gpuErrchk(ans){gpuAssert((ans), __FILE__, __LINE__);}

/*
 * Reports a failed CUDA runtime call on stderr.
 *
 * code  - status returned by the CUDA runtime call.
 * file  - source file of the call site (normally __FILE__).
 * line  - source line of the call site (normally __LINE__).
 * abort - accepted for signature compatibility only; this function never
 *         terminates the process, it only logs.
 *
 * Nothing is printed when code == cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true){
    (void)abort; // intentionally unused, see above
    if (code != cudaSuccess){
        // Separate the three fields so the message is readable
        // (previously error text, file name and line number ran together).
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    }
}


/*
 * For every i in [0, *length) computes
 *     first[i] = (*dxmax) * ergArray[i]
 *     last[i]  = (*dxmax) * ergArray[i + 1] - 1
 * with the float result converted to int on store.
 *
 * One thread handles one i, derived from a 1D block/grid index; threads whose
 * index is >= *length do nothing. All pointers are dereferenced inside the
 * kernel. ergArray is read up to index *length, so it must hold at least
 * *length + 1 elements; first and last must hold at least *length elements.
 */
__global__ void startEndIndex(int *ergArray, int *first, int *last, float *dxmax, unsigned int *length){

    const unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;

    // Surplus threads past the end of the data have nothing to do.
    if (tid >= *length){
        return;
    }

    const float scale = *dxmax;

    first[tid] = scale*ergArray[tid];
    last[tid] = scale*ergArray[tid + 1] - 1;
}



void rotateOSSARTrechnung(std::vector<float> *detektor, SparseMatrix<float, float, float> *systemMatrix_coo, Volumen<float, float, float> *volumen, unsigned int iterationen, std::vector<float> *deltaBIterationN, std::vector<float> *matdVoxelGrid, float projektionen,float dxmax, float detZellen, unsigned int threads_max_n, unsigned int threads_max_m, unsigned int threads_max_nnz){

unsigned int nnz = (unsigned int)systemMatrix_coo->nnz;
unsigned int n = (unsigned int)systemMatrix_coo->columnNumber; 
unsigned int mNeu = detZellen; 

float *measuredValues = 0; measuredValues = new float[mNeu](); 

float *volumeN = 0; volumeN =new float[n]();
float *volumeAlt = 0; volumeAlt = new float[n](); 

float *initValuesM = 0; initValuesM = new float[mNeu]();

float *volumeNInitZero = 0; volumeNInitZero = new float[n]();
float *initValuesMInitZero = 0; initValuesMInitZero = new float[mNeu]();
int *cooRowHostPtr=0; cooRowHostPtr = new int[nnz]();
int *cooColHostPtr=0; cooColHostPtr = new int[nnz]();
float *cooValuesHostPtr = 0; cooValuesHostPtr = new float[nnz]();

unsigned int *colIndex = 0; colIndex = new unsigned int[nnz]();
float *valIndex = 0; valIndex = new float[nnz]();
unsigned int *colIndexStepSize = 0; colIndexStepSize = new unsigned int[n]();

for (unsigned int i = 0; i < n; i++){
    colIndexStepSize[i] = nnz;
}

unsigned int length = matdVoxelGrid->size();
int *ergArray = 0; ergArray = new int[length+1]();

int *first = 0; first = new int[length]();
int *last = 0; last = new int[length]();

int *cooHostColRot = 0; cooHostColRot = new int[nnz]();


int *d_cooColPtr;
int *d_cooRowPtr;
unsigned int *d_nnz;

int *d_colIndexPtr;
float *d_valIndexPtr;
unsigned int *d_colIndexStepSizePtr;

float *d_cooValuesPtr;
float *d_measuredValues;

float *d_volume_alt;
float *d_volume_neu;

int *d_ergArray; 

float *d_dxmax;
unsigned int *d_length;
unsigned int *d_size;

int *d_first;
int *d_last;

int *d_cooColRotPtr; 

unsigned int *d_count;

gpuErrchk(cudaMalloc((void**)&d_cooRowPtr, nnz*sizeof(int)));;
gpuErrchk(cudaMalloc((void**)&d_cooColPtr, nnz*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_cooValuesPtr, nnz*sizeof(float)));

gpuErrchk(cudaMalloc((void**)&d_measuredValues, mNeu*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_volume_alt, n*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_volume_neu, n*sizeof(float)));

gpuErrchk(cudaMalloc((void**)&d_nnz, sizeof(unsigned int)));

gpuErrchk(cudaMalloc((void**)&d_colIndexPtr, (nnz)*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_valIndexPtr, (nnz)*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_colIndexStepSizePtr, n*sizeof(unsigned int)));

gpuErrchk(cudaMalloc((void**)&d_ergArray, (length+1)*sizeof(int)));

gpuErrchk(cudaMalloc((void**)&d_dxmax, sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_length, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_size, sizeof(unsigned int)));

gpuErrchk(cudaMalloc((void**)&d_first, length*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_last, length*sizeof(int)));

gpuErrchk(cudaMalloc((void**)&d_cooColRotPtr, nnz*sizeof(int)));

gpuErrchk(cudaMalloc((void**)&d_count, sizeof(unsigned int)));

for (unsigned int i = 0; i < nnz; i++){
    cooRowHostPtr[i] = systemMatrix_coo->cooRowInd->at(i);
    cooColHostPtr[i] = systemMatrix_coo->cooColInd->at(i);
    cooValuesHostPtr[i] = systemMatrix_coo->cooValues->at(i);
}

for (unsigned int j = 0; j < n; j++){
    volumen->setValueAtElement(j, (float)cooColHostPtr[j]);
}

gpuErrchk(cudaMemcpy(d_nnz, &nnz, sizeof(unsigned int), cudaMemcpyHostToDevice));

gpuErrchk(cudaMemcpy(d_dxmax, &dxmax, sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_length, &length, sizeof(unsigned int), cudaMemcpyHostToDevice));

// (Initialwerte sind immer gleich)
gpuErrchk(cudaMemcpy(d_cooRowPtr, cooRowHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_cooValuesPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_cooColPtr, cooColHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));

gpuErrchk(cudaMemcpy(d_valIndexPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));

unsigned int threads_nnz = threads_max_nnz;

unsigned int thread_length = length;
unsigned int block_length = 1;

unsigned int index = 0;

for (unsigned int s = 0; s < length; s++){
    for (unsigned int t = 0; t <= s; t++){
        index = s + 1;
        ergArray[index] += (int)matdVoxelGrid->at(t);
    }
}

gpuErrchk(cudaMemcpy(d_ergArray, ergArray, (length+1)*sizeof(int), cudaMemcpyHostToDevice));

startEndIndex <<< block_length, thread_length >>>(d_ergArray, d_first, d_last, d_dxmax, d_length);

gpuErrchk(cudaMemcpy(first, d_first, length*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(last, d_last, length*sizeof(int), cudaMemcpyDeviceToHost));

for (unsigned int j = 0; j < length; j++){
    volumen->setValueAtElement(j, (float)first[j]);
}


for (unsigned int j = 0; j < length; j++){
    volumen->setValueAtElement(j, (float)last[j]);
}


unsigned int size = 0;

for (unsigned int iter = 0; iter < iterationen; iter++){
    for (unsigned int proj = 1; proj <= projektionen; proj++){

        unsigned int begin1 = (proj - 1)*mNeu;
        unsigned int end1 = proj*mNeu;

        for (unsigned int j = begin1; j < end1; j++){
            measuredValues[j] = detektor->at(j);
        }

        gpuErrchk(cudaMemcpy(d_measuredValues, measuredValues, mNeu*sizeof(float), cudaMemcpyHostToDevice));

        for (unsigned int u = 0; u < length; u++){
            size = ceil(matdVoxelGrid->at(u)* (proj - 1) * dxmax / projektionen);
            gpuErrchk(cudaMemcpy(d_size, &size, sizeof(unsigned int), cudaMemcpyHostToDevice));

            gpuErrchk(cudaMemcpy(d_count, &u, sizeof(unsigned int), cudaMemcpyHostToDevice));

            if (proj > 1){

                for (unsigned int i = 0; i < nnz; i++) {//(first[u] <= cooCols[index] <= last[u]){

                    if (first[u] <= cooColHostPtr[i] && cooColHostPtr[i] <= last[u]){
                        cooHostColRot[i] = first[u] + (int)(cooColHostPtr[i] + size) % (last[u] - first[u] + 1);// (int)(cooColHostPtr[i] + size) % (last[u]); // (int)(first[u] + ((int)(cooColHostPtr[i] + dxmax) % (last[u] - first[u] + 1)));
                    }
                }
            }
            else{
                for (unsigned int i = 0; i < nnz; i++) {
                    cooHostColRot[i] = cooColHostPtr[i];
                }
            }
        }


// --------- troubling code starts HERE ----------------
        unsigned int wert = 0, index = 0;
        for (unsigned int i = 0; i < nnz; i++){
            index = cooHostColRot[i];
            wert = colIndexStepSize[index];

            if (wert >= i){
                colIndexStepSize[index] = i;
            }
        }

        for (unsigned int j = 0; j < n; j++){
            volumen->setValueAtElement(j, colIndexStepSize[j]);
        }

            gpuErrchk(cudaMemcpy(d_colIndexStepSizePtr, colIndexStepSize, n*sizeof(unsigned int), cudaMemcpyHostToDevice));

    // --------- troubling code ends HERE ----------------

             gpuErrchk(cudaMemcpy(d_colIndexPtr, cooHostColRot, nnz*sizeof(int), cudaMemcpyHostToDevice));


        }
    }


    cudaFree(d_cooRowPtr);
    cudaFree(d_cooColPtr);
    cudaFree(d_cooValuesPtr);
    cudaFree(d_measuredValues);
    cudaFree(d_volume_alt);
    cudaFree(d_volume_neu);
    cudaFree(d_colCount);
    cudaFree(d_rowCount);
    cudaFree(d_ergSumCol);
    cudaFree(d_ergSumRow);
    cudaFree(d_ergMult);
    cudaFree(d_nnz);
    cudaFree(d_faktor);
    cudaFree(d_colIndexPtr);
    cudaFree(d_valIndexPtr);
    cudaFree(d_ergSumNNZforCol);
    cudaFree(d_colIndexStepSizePtr);
    cudaFree(d_deltaB);

    cudaFree(d_ergArray);
    cudaFree(d_dxmax);
    cudaFree(d_length);
    cudaFree(d_size);
    cudaFree(d_first);
    cudaFree(d_last);
    cudaFree(d_cooColRotPtr);
    cudaFree(d_count);

    delete[](ergArray); ergArray = NULL;
    delete[](measuredValues); measuredValues = NULL;
    delete[](cooColHostPtr); cooColHostPtr = NULL;
    delete[](cooRowHostPtr); cooRowHostPtr = NULL;
    delete[](cooValuesHostPtr); cooValuesHostPtr = NULL;
    delete[](volumeN); volumeN = NULL;
    delete[](ergArray); ergArray = NULL;
    delete[](initValuesM); initValuesM = NULL;
    delete[](colIndex); colIndex = NULL;
    delete[](valIndex); valIndex = NULL;
    delete[](volumeAlt); volumeAlt = NULL;
    delete[](volumeNInitZero); volumeNInitZero = NULL;
    delete[](initValuesMInitZero); initValuesMInitZero = NULL;
    delete[](colIndexStepSize); colIndexStepSize = NULL;
    delete[](deltaBArray); deltaBArray = NULL;

    delete[](first); first = NULL;
    delete[](last); last = NULL;

    delete[](cooHostColRot); cooHostColRot = NULL;

    deltaB->~vector();
    deltaB = NULL;

}

如果有人发现我犯了任何错误,请告诉我,我愿意接受任何建议。

提前致谢! 最好的问候

修改:@AnderBiguri是对的,我对数组 measuredValues 进行了越界访问。以下是相关代码更正后的部分:

        // Copy the mNeu detector readings belonging to projection `proj` into
        // measuredValues. The destination is indexed from 0 so every write stays
        // inside its mNeu elements; only the source index carries the
        // per-projection offset (proj-1)*mNeu.
        for (unsigned int j = 0; j < mNeu; j++){
            measuredValues[j] = detektor->at((proj-1)*mNeu+j);
        }

measuredValues 只有 mNeu 个元素,但我却访问了超出这一范围的元素。

所以,非常感谢你的帮助!