Question

如果我改变while循环（参见下面的内核，它是一个怪异的循环，你不能错过它）只迭代一次，它使用的内存数量可以忽略不计。但是，当循环允许迭代50,000次，如下所示，GPU立即占用2.5 GB。即使使用“for”循环，问题仍然存在。有人可以提供一个解释，也许是一个解决方案，以防止内核使用这么多内存？这种行为非常不寻常，IMO。提前谢谢！

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>

    __global__ void myKern(const float *transMatrix, float *masterForces, const double *rands, const int r_max)
    {


    const int iterationsx = 50000;
    const int RUsizex = 26;

    int threadsPerBlock = blockDim.x * blockDim.y;
    int blockId = blockIdx.x + (blockIdx.y * gridDim.x);
    int threadId = threadIdx.x + (threadIdx.y * blockDim.x); 
    int globalIdx = (blockId * threadsPerBlock) + threadId;

    int RU[RUsizex] = {0};

    int index = 0;
    float r = 0.0;
    double temp = 0;

    float forces[iterationsx] = {0.0};
    int left[RUsizex - 2] = {0};
    int right[RUsizex - 2] = {0};

    curandState s;
    curand_init (rands[globalIdx] , 0, 0, &s);

    int i= 0;

    while( i < iterationsx)
    {

            for(int k = 0; k < RUsizex - 2; k++)
            {
            left[k] = RU[k];
            right[k] = RU[k+2];
            }

            for(int j = 0; j < RUsizex -2; j++)
            {

                r = curand_uniform(&s);



                index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;



                RU[j + 1]= (RU[j + 1]) + ( r < transMatrix[index]) * (transMatrix[index + 1]) +
                    (! (r < transMatrix[index])) * ( r < transMatrix[index + 2]) * (transMatrix[index + 3]) +
                    (! ( r < transMatrix[index + 2])) * (r < transMatrix[index + 4]) * (transMatrix[index + 5]) ;



            }


            for(int z = 1; z < RUsizex - 1; z++)
            {
                temp = temp + (RU[z] ==4) + (RU[z] ==5);

            }

            forces[i] = temp/(24.0);
            temp = 0.0;
    i++;
    }


    for(int y = 0; y < iterationsx; y++)
    {
        masterForces[globalIdx + (r_max * y)] = forces[y]; 
    }





    }

Answer 1

变量float forces [iterationsx]是全局函数中的堆栈变量。这需要堆栈预留＆gt;每个线程200,000B。 CUDA驱动程序必须使用公式SmCount * MaxTheadsPerSm *（LocalMemoryPerThread + StackPerThread）根据最大驻留线程分配本地内存分配。对于完整的GK110，这将是15 * 2048 * ~51KiB = 1.5 GiB。

＆＃34;而＆＃34; /＆＃34;对于＆＃34;内核中的循环导致CUDA内存不足错误？

1 个答案: