我已经编写了一个用于模拟John Conway's Game of Life的CUDA程序,但有时我会出现内存泄漏(出现不应该出现的单元格)。
这是我的内核:
__global__ void gameOfLife(matrix pcuda_main,int lblock,int generations_to_run) {
/************************************
notice: first dimension is one section for read and one for
write(change purpose every simulation generation). 2 extra section rows
for first and last line not need to check their up and down position evry time
also edge lines ensure that blocks handles edge sectors are having edge lines less that other blocks
************************************/
__shared__ unsigned int section[2][SECTION_SIZE][CELLS_IN_LINE];
int i,j;
unsigned int sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b;
unsigned int left_top,left,left_bot,right_top,right,right_bot;
int read_section=0;
int write_section=1;
int bx = blockIdx.x;
int row = SECTION_ROWS*blockIdx.x+threadIdx.x;
int rowx = threadIdx.x+SECTION_ROWS;
// I am zeroeing the perimiters lines since they dont loaded with values and can be corrupt while more lines may be zeroes its done for avoiding ifs
section[0][rowx>>LINE_NUMBER_BITS_OFFSET][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[1][rowx>>LINE_NUMBER_BITS_OFFSET][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[0][SECTION_ROWS-1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[1][SECTION_ROWS-1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[0][3*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[1][3*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[0][2*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[1][2*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
__syncthreads(); // ensure no crashes between zeroeing perimiter lines and loading data
// since entire warp access the first and last cell together no extrag edges are needed however only first and last line access lines beyond the edges
for(i=0;i<CELLS_IN_LINE;i++) {
if ( bx > 0 ) {
section[0][rowx-SECTION_ROWS+1][i] = pcuda_main[((row-SECTION_ROWS)<<LINE_NUMBER_BITS_OFFSET)+i];
}
section[0][rowx+1][i] = pcuda_main[((row)<<LINE_NUMBER_BITS_OFFSET)+i];
if ( bx < lblock ) {
// not last block sector row
section[0][rowx+SECTION_ROWS+1][i] = pcuda_main[((row+SECTION_ROWS)<<LINE_NUMBER_BITS_OFFSET)+i];
}
}
__syncthreads(); // ensure all data read
for ( i=0;i<generations_to_run;i++ ) {
for(j=0;j<CELLS_IN_LINE;j++) {
if ( bx > 0 ) {
if ( j > 0 ) {
left_top = section[read_section][rowx-SECTION_ROWS][j-1];
left = section[read_section][rowx-SECTION_ROWS+1][j-1];
left_bot = section[read_section][rowx-SECTION_ROWS+2][j-1];
} else {
left_top = 0;
left=0;
left_bot=0;
}
if ( j<CELLS_IN_LINE_RESIDUE ) {
right_top= section[read_section][rowx-SECTION_ROWS][j+1];
right= section[read_section][rowx-SECTION_ROWS+1][j+1];
right_bot= section[read_section][rowx-SECTION_ROWS+2][j+1];
} else {
right_top = 0;
right=0;
right_bot=0;
}
CELL32(section[write_section][rowx-SECTION_ROWS+1][j],
left_top,section[read_section][rowx-SECTION_ROWS][j],right_top,
left,section[read_section][rowx-SECTION_ROWS+1][j],right,
left_bot,section[read_section][rowx-SECTION_ROWS+2][j],right_bot,
sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
}
if ( j > 0 ) {
left_top = section[read_section][rowx][j-1];
left = section[read_section][rowx+1][j-1];
left_bot = section[read_section][rowx+2][j-1];
} else {
left_top = 0;
left=0;
left_bot=0;
}
if ( j<CELLS_IN_LINE_RESIDUE ) {
right_top= section[read_section][rowx][j+1];
right= section[read_section][rowx+1][j+1];
right_bot= section[read_section][rowx+2][j+1];
} else {
right_top = 0;
right=0;
right_bot=0;
}
CELL32(section[write_section][rowx+1][j],
left_top,section[read_section][rowx][j],right_top,
left,section[read_section][rowx+1][j],right,
left_bot,section[read_section][rowx+2][j],right_bot,
sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
if ( bx < lblock ) {
if ( j > 0 ) {
left_top = section[read_section][rowx+SECTION_ROWS][j-1];
left = section[read_section][rowx+SECTION_ROWS+1][j-1];
left_bot = section[read_section][rowx+SECTION_ROWS+2][j-1];
} else {
left_top = 0;
left=0;
left_bot=0;
}
if ( j<CELLS_IN_LINE_RESIDUE ) {
right_top= section[read_section][rowx+SECTION_ROWS][j+1];
right= section[read_section][rowx+SECTION_ROWS+1][j+1];
right_bot= section[read_section][rowx+SECTION_ROWS+2][j+1];
} else {
right_top = 0;
right=0;
right_bot=0;
}
CELL32(section[write_section][rowx+SECTION_ROWS+1][j],
left_top,section[read_section][rowx+SECTION_ROWS][j],right_top,
left,section[read_section][rowx+SECTION_ROWS+1][j],right,
left_bot,section[read_section][rowx+SECTION_ROWS+2][j],right_bot,
sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
}
}
read_section = read_section^1;
write_section = write_section^1;
//printf("passed %u generation for row: %u\n",i,row);
__syncthreads();
}
// now writing back to the global memory notice write section turns into read section after every generation so
// I write the read section
for(i=0;i<CELLS_IN_LINE;i++) {
pcuda_main[((row)<<LINE_NUMBER_BITS_OFFSET)+i] = section[read_section][rowx+1][i];
}
__syncthreads();
}
这里是单元格32的定义及其依赖:
#define HALFADDER(s0,s1,a0,a1)do{s1=(a0)&(a1);s0=(a0)^(a1);}while(0)
#define FULLADDER(s0,s1,a0,a1,a2)do{s1=((a0)&(a1))|((a2)&((a0)^(a1)));s0 =(a2)^((a0)^(a1));}while(0)
#define CELL32(output,top_left,top,top_right,left,cur,right,bot_left,bot,bot_right,sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b)do{FULLADDER(sum_top_b0,sum_top_b1,(top_left<<31)|(top>>1),top,(top_right>>31)|(top<<1));HALFADDER(sum_cur_b0,sum_cur_b1,(left<<31)|(cur>>1),(right>>31)|(cur<<1));FULLADDER(sum_bot_b0,sum_bot_b1,(bot_left<<31)|(bot>>1),bot,((bot_right>>31)|(bot<<1)));FULLADDER(newone,newtwo,sum_bot_b0,sum_cur_b0,sum_top_b0);FULLADDER(newtwo,new4a,newtwo,sum_bot_b1,sum_top_b1);HALFADDER(newtwo,new4b,newtwo,sum_cur_b1);newone=newone|cur;output=newone&newtwo&(~new4a)&(~new4b);}while(0)
这里是主循环和内存副本:
cudaMalloc((void **)(&pcuda),NUM_CELLS*sizeof(unsigned int));
//cudaMalloc((void **)(&pdata),ROWS*sizeof(int));
cudaMemcpy((void *)pcuda,(void *)p,sizeof(unsigned int)*NUM_CELLS,cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
while ( generations > 0 ) {
if ( generations > SECTION_ROWS ) {
generations_run = SECTION_ROWS;
} else {
generations_run = generations;
}
generations -= generations_run;
printf("running params last_row:%u, generations_run:%u, generations left:%u,grid size:%d, array size in bytes: %u,last cell index: %d\n",
ROWS-SECTION_ROWS,generations_run,generations,dimGrid.x,NUM_CELLS*sizeof(unsigned int),((ROWS-1)<<LINE_NUMBER_BITS_OFFSET)+CELLS_IN_LINE_RESIDUE);
gameOfLife<<<dimGrid,dimBlock>>>(pcuda,dimGrid.x-1,generations_run);
error = cudaDeviceSynchronize();
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
lineid = __LINE__;
err = error;
}
else
{
printf("GPU Device %d:synchronized\n", devID);
}
}
cudaMemcpy((void *)p,(void *)pcuda,sizeof(int)*NUM_CELLS,cudaMemcpyDeviceToHost);
内存泄露在哪里?
答案 0 :(得分:3)
CUDA工具包附带cuda-memcheck,默认情况下会检查内核中的越界访问。它还有其他模式,包括泄漏检查器。请注意,您需要在退出之前调用cudaDeviceReset(),以便工具知道查找未发布的设备内存。