如果我改变while循环(参见下面的内核,它是一个怪异的循环,你不能错过它)只迭代一次,它使用的内存数量可以忽略不计。但是,当循环允许迭代50,000次,如下所示,GPU立即占用2.5 GB。即使使用“for”循环,问题仍然存在。有人可以提供一个解释,也许是一个解决方案,以防止内核使用这么多内存?这种行为非常不寻常,IMO。提前谢谢!
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>
__global__ void myKern(const float *transMatrix, float *masterForces, const double *rands, const int r_max)
{
const int iterationsx = 50000;
const int RUsizex = 26;
int threadsPerBlock = blockDim.x * blockDim.y;
int blockId = blockIdx.x + (blockIdx.y * gridDim.x);
int threadId = threadIdx.x + (threadIdx.y * blockDim.x);
int globalIdx = (blockId * threadsPerBlock) + threadId;
int RU[RUsizex] = {0};
int index = 0;
float r = 0.0;
double temp = 0;
float forces[iterationsx] = {0.0};
int left[RUsizex - 2] = {0};
int right[RUsizex - 2] = {0};
curandState s;
curand_init (rands[globalIdx] , 0, 0, &s);
int i= 0;
while( i < iterationsx)
{
for(int k = 0; k < RUsizex - 2; k++)
{
left[k] = RU[k];
right[k] = RU[k+2];
}
for(int j = 0; j < RUsizex -2; j++)
{
r = curand_uniform(&s);
index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ;
RU[j + 1]= (RU[j + 1]) + ( r < transMatrix[index]) * (transMatrix[index + 1]) +
(! (r < transMatrix[index])) * ( r < transMatrix[index + 2]) * (transMatrix[index + 3]) +
(! ( r < transMatrix[index + 2])) * (r < transMatrix[index + 4]) * (transMatrix[index + 5]) ;
}
for(int z = 1; z < RUsizex - 1; z++)
{
temp = temp + (RU[z] ==4) + (RU[z] ==5);
}
forces[i] = temp/(24.0);
temp = 0.0;
i++;
}
for(int y = 0; y < iterationsx; y++)
{
masterForces[globalIdx + (r_max * y)] = forces[y];
}
}
答案 0 :(得分:2)
变量float forces [iterationsx]是全局函数中的堆栈变量。这需要堆栈预留&gt;每个线程200,000B。 CUDA驱动程序必须使用公式SmCount * MaxTheadsPerSm *(LocalMemoryPerThread + StackPerThread)根据最大驻留线程分配本地内存分配。对于完整的GK110,这将是15 * 2048 * ~51KiB = 1.5 GiB。