我在查找内存分配错误时遇到了一些麻烦。我目前在GeForce GT 630上使用Visual Studio 2013,Matlab 2015b和CUDA 7.0,我是GPU编程,CUDA和mex的新手。
当我使用mexcuda从Matlab调用我的代码时,它会很好,直到我将带有colIndexStepSize的小部分添加到.cu文件中。程序正常运行直到删除。在通知我这里遇到断点之后,Matlab崩溃了。
当我删除有问题的代码行时,一切都会顺利运行。
我很确定我的内存处理有问题但我简直找不到这个bug。以下是制造麻烦的代码:
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusparse.h>
#include <device_launch_parameters.h>
#include <curand.h>
#include <vector>
// Test-Makro : (Funktionieren die Zugriffe auf die GPU?)
#define gpuErrchk(ans){gpuAssert((ans), __FILE__, __LINE__);}
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true){
if (code != cudaSuccess){
fprintf(stderr, "GPUassert: %s%s%d\n", cudaGetErrorString(code), file, line);
}
}
__global__ void startEndIndex(int *ergArray, int *first, int *last, float *dxmax, unsigned int *length){
unsigned int index = threadIdx.x + blockIdx.x*blockDim.x;
if (index < *length){
first[index] = (*dxmax)*ergArray[index];
last[index] = (*dxmax)*ergArray[index + 1] - 1;
}
}
void rotateOSSARTrechnung(std::vector<float> *detektor, SparseMatrix<float, float, float> *systemMatrix_coo, Volumen<float, float, float> *volumen, unsigned int iterationen, std::vector<float> *deltaBIterationN, std::vector<float> *matdVoxelGrid, float projektionen,float dxmax, float detZellen, unsigned int threads_max_n, unsigned int threads_max_m, unsigned int threads_max_nnz){
unsigned int nnz = (unsigned int)systemMatrix_coo->nnz;
unsigned int n = (unsigned int)systemMatrix_coo->columnNumber;
unsigned int mNeu = detZellen;
float *measuredValues = 0; measuredValues = new float[mNeu]();
float *volumeN = 0; volumeN =new float[n]();
float *volumeAlt = 0; volumeAlt = new float[n]();
float *initValuesM = 0; initValuesM = new float[mNeu]();
float *volumeNInitZero = 0; volumeNInitZero = new float[n]();
float *initValuesMInitZero = 0; initValuesMInitZero = new float[mNeu]();
int *cooRowHostPtr=0; cooRowHostPtr = new int[nnz]();
int *cooColHostPtr=0; cooColHostPtr = new int[nnz]();
float *cooValuesHostPtr = 0; cooValuesHostPtr = new float[nnz]();
unsigned int *colIndex = 0; colIndex = new unsigned int[nnz]();
float *valIndex = 0; valIndex = new float[nnz]();
unsigned int *colIndexStepSize = 0; colIndexStepSize = new unsigned int[n]();
for (unsigned int i = 0; i < n; i++){
colIndexStepSize[i] = nnz;
}
unsigned int length = matdVoxelGrid->size();
int *ergArray = 0; ergArray = new int[length+1]();
int *first = 0; first = new int[length]();
int *last = 0; last = new int[length]();
int *cooHostColRot = 0; cooHostColRot = new int[nnz]();
int *d_cooColPtr;
int *d_cooRowPtr;
unsigned int *d_nnz;
int *d_colIndexPtr;
float *d_valIndexPtr;
unsigned int *d_colIndexStepSizePtr;
float *d_cooValuesPtr;
float *d_measuredValues;
float *d_volume_alt;
float *d_volume_neu;
int *d_ergArray;
float *d_dxmax;
unsigned int *d_length;
unsigned int *d_size;
int *d_first;
int *d_last;
int *d_cooColRotPtr;
unsigned int *d_count;
gpuErrchk(cudaMalloc((void**)&d_cooRowPtr, nnz*sizeof(int)));;
gpuErrchk(cudaMalloc((void**)&d_cooColPtr, nnz*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_cooValuesPtr, nnz*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_measuredValues, mNeu*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_volume_alt, n*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_volume_neu, n*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_nnz, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_colIndexPtr, (nnz)*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_valIndexPtr, (nnz)*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_colIndexStepSizePtr, n*sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_ergArray, (length+1)*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_dxmax, sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_length, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_size, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_first, length*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_last, length*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_cooColRotPtr, nnz*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_count, sizeof(unsigned int)));
for (unsigned int i = 0; i < nnz; i++){
cooRowHostPtr[i] = systemMatrix_coo->cooRowInd->at(i);
cooColHostPtr[i] = systemMatrix_coo->cooColInd->at(i);
cooValuesHostPtr[i] = systemMatrix_coo->cooValues->at(i);
}
for (unsigned int j = 0; j < n; j++){
volumen->setValueAtElement(j, (float)cooColHostPtr[j]);
}
gpuErrchk(cudaMemcpy(d_nnz, &nnz, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_dxmax, &dxmax, sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_length, &length, sizeof(unsigned int), cudaMemcpyHostToDevice));
// (Initialwerte sind immer gleich)
gpuErrchk(cudaMemcpy(d_cooRowPtr, cooRowHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_cooValuesPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_cooColPtr, cooColHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_valIndexPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
unsigned int threads_nnz = threads_max_nnz;
unsigned int thread_length = length;
unsigned int block_length = 1;
unsigned int index = 0;
for (unsigned int s = 0; s < length; s++){
for (unsigned int t = 0; t <= s; t++){
index = s + 1;
ergArray[index] += (int)matdVoxelGrid->at(t);
}
}
gpuErrchk(cudaMemcpy(d_ergArray, ergArray, (length+1)*sizeof(int), cudaMemcpyHostToDevice));
startEndIndex <<< block_length, thread_length >>>(d_ergArray, d_first, d_last, d_dxmax, d_length);
gpuErrchk(cudaMemcpy(first, d_first, length*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(last, d_last, length*sizeof(int), cudaMemcpyDeviceToHost));
for (unsigned int j = 0; j < length; j++){
volumen->setValueAtElement(j, (float)first[j]);
}
for (unsigned int j = 0; j < length; j++){
volumen->setValueAtElement(j, (float)last[j]);
}
unsigned int size = 0;
for (unsigned int iter = 0; iter < iterationen; iter++){
for (unsigned int proj = 1; proj <= projektionen; proj++){
unsigned int begin1 = (proj - 1)*mNeu;
unsigned int end1 = proj*mNeu;
for (unsigned int j = begin1; j < end1; j++){
measuredValues[j] = detektor->at(j);
}
gpuErrchk(cudaMemcpy(d_measuredValues, measuredValues, mNeu*sizeof(float), cudaMemcpyHostToDevice));
for (unsigned int u = 0; u < length; u++){
size = ceil(matdVoxelGrid->at(u)* (proj - 1) * dxmax / projektionen);
gpuErrchk(cudaMemcpy(d_size, &size, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_count, &u, sizeof(unsigned int), cudaMemcpyHostToDevice));
if (proj > 1){
for (unsigned int i = 0; i < nnz; i++) {//(first[u] <= cooCols[index] <= last[u]){
if (first[u] <= cooColHostPtr[i] && cooColHostPtr[i] <= last[u]){
cooHostColRot[i] = first[u] + (int)(cooColHostPtr[i] + size) % (last[u] - first[u] + 1);// (int)(cooColHostPtr[i] + size) % (last[u]); // (int)(first[u] + ((int)(cooColHostPtr[i] + dxmax) % (last[u] - first[u] + 1)));
}
}
}
else{
for (unsigned int i = 0; i < nnz; i++) {
cooHostColRot[i] = cooColHostPtr[i];
}
}
}
// --------- troubling code starts HERE ----------------
unsigned int wert = 0, index = 0;
for (unsigned int i = 0; i < nnz; i++){
index = cooHostColRot[i];
wert = colIndexStepSize[index];
if (wert >= i){
colIndexStepSize[index] = i;
}
}
for (unsigned int j = 0; j < n; j++){
volumen->setValueAtElement(j, colIndexStepSize[j]);
}
gpuErrchk(cudaMemcpy(d_colIndexStepSizePtr, colIndexStepSize, n*sizeof(unsigned int), cudaMemcpyHostToDevice));
// --------- troubling code ends HERE ----------------
gpuErrchk(cudaMemcpy(d_colIndexPtr, cooHostColRot, nnz*sizeof(int), cudaMemcpyHostToDevice));
}
}
cudaFree(d_cooRowPtr);
cudaFree(d_cooColPtr);
cudaFree(d_cooValuesPtr);
cudaFree(d_measuredValues);
cudaFree(d_volume_alt);
cudaFree(d_volume_neu);
cudaFree(d_colCount);
cudaFree(d_rowCount);
cudaFree(d_ergSumCol);
cudaFree(d_ergSumRow);
cudaFree(d_ergMult);
cudaFree(d_nnz);
cudaFree(d_faktor);
cudaFree(d_colIndexPtr);
cudaFree(d_valIndexPtr);
cudaFree(d_ergSumNNZforCol);
cudaFree(d_colIndexStepSizePtr);
cudaFree(d_deltaB);
cudaFree(d_ergArray);
cudaFree(d_dxmax);
cudaFree(d_length);
cudaFree(d_size);
cudaFree(d_first);
cudaFree(d_last);
cudaFree(d_cooColRotPtr);
cudaFree(d_count);
delete[](ergArray); ergArray = NULL;
delete[](measuredValues); measuredValues = NULL;
delete[](cooColHostPtr); cooColHostPtr = NULL;
delete[](cooRowHostPtr); cooRowHostPtr = NULL;
delete[](cooValuesHostPtr); cooValuesHostPtr = NULL;
delete[](volumeN); volumeN = NULL;
delete[](ergArray); ergArray = NULL;
delete[](initValuesM); initValuesM = NULL;
delete[](colIndex); colIndex = NULL;
delete[](valIndex); valIndex = NULL;
delete[](volumeAlt); volumeAlt = NULL;
delete[](volumeNInitZero); volumeNInitZero = NULL;
delete[](initValuesMInitZero); initValuesMInitZero = NULL;
delete[](colIndexStepSize); colIndexStepSize = NULL;
delete[](deltaBArray); deltaBArray = NULL;
delete[](first); first = NULL;
delete[](last); last = NULL;
delete[](cooHostColRot); cooHostColRot = NULL;
deltaB->~vector();
deltaB = NULL;
}
如果有人发现我犯了任何错误,请告诉我,我愿意接受任何建议。
提前致谢! 最好的问候
修改 @AnderBiguri是对的,我对数组 measuredValues 进行了界限访问。以下是相关代码的更正部分:
for (unsigned int j = 0; j < mNeu; j++){
measuredValues[j] = detektor->at((proj-1)*mNeu+j);
}
measuredValues 只有mNeu元素很长但我确实在这一点后面访问了一些元素。
所以,非常感谢你的帮助!