Question

我目前正在使用 CUDA，并尝试用这项技术来优化一个程序。我有一个较大的内核，需要将它启动 10 万次以上到 1 亿次以上，甚至可能达到数十亿次。

所以我读到使用dim3变量允许启动那么多线程（cf：https://devtalk.nvidia.com/default/topic/621867/size-limitation-for-1d-arrays-in-cuda-/?offset=7）

我有一段示例代码（在我的 GTX 970 上）有时能正常运行，有时却不能。

#ifndef PROPAGATORSAT_CUH_
/* Include-guard macro: it must carry the same name that the enclosing
 * #ifndef tests (PROPAGATORSAT_CUH_), otherwise the guard never takes effect. */
# define PROPAGATORSAT_CUH_
/* Pi, defined here only if no earlier header has already provided it. */
# ifndef M_PI
#  define M_PI (3.14159265358979323846)
# endif
# define TWO_PI (2 * M_PI)
/* Total simulated time span and the step between two consecutive samples.
 * The kernel passes STEP * tid to Propagator::evaluate as timeSec. */
# define TOTAL_TIME (615359.772)
# define STEP (0.771)
/* Number of samples covering TOTAL_TIME (a double, not rounded). */
# define NB_IT (TOTAL_TIME / (double)STEP)
/* Launch shape helpers: threads per block and the block count obtained by
 * dividing NB_IT by NB_THREADS, rounding up.  The expansion is fully
 * parenthesised so that it behaves as a single operand in any expression. */
# define NB_THREADS (1024)
# define NB_BLOCKS ((int)((NB_IT + NB_THREADS - 1) / NB_THREADS))

# include <cmath>
# include <cfloat>
# include <stdio.h>
# include "../common/book.h"
# include "cuda_runtime.h"
# include "device_launch_parameters.h"
/*
 * gpuErrchk(ans): evaluates the CUDA runtime call `ans` once and hands its
 * status, together with the call site's file and line, to gpuAssert().
 * The body is wrapped in do { ... } while (0) so that `gpuErrchk(x);` is a
 * single statement and stays correct inside an unbraced if/else.
 */
# define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/*
 * Reports a failed CUDA runtime call.
 *
 * code  : status returned by the CUDA runtime call.
 * file  : source file of the call site (printed in the message).
 * line  : source line of the call site (printed in the message).
 * abort : when true (the default) the process exits with `code` as its
 *         exit status after printing; when false the error is only printed.
 *
 * Does nothing when code == cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}

/*
 * Global: constants and small angle helpers shared by the propagator code.
 * The data members are per-instance constants set by default member
 * initializers; the helper methods are defined out of class further down.
 */
class Global
{
public:
    // Earth radius; Propagator::propagator divides it by 1000 to obtain _ReKm.
    const double _ITURadEarth = 6378145.0;
    // Gravitational constant read as `mu` by Propagator::propagator.
    const double _ITUGravCst = 3.986012E5;
    // J2 coefficient used when computing _n_bar, _OMEGA_r and _omega_r.
    const double _ITUJ2 = 0.001082636;
    // Reference angle, currently zero (the previous value is kept in the trailing comment).
    const double _J2000AngleDeg = 0;//-79.8058;
    const double _J2000AngleRad = 0;//TO_RAD(_J2000AngleDeg);
    // Earth rotation rate and the same value converted with degToRad().
    // NOTE(review): presumably degrees per second - confirm.
    const double _ITUAngleRateEarthRot = 4.1780745823E-3;
    const double _ITUAngleRateEarthRotRad = degToRad(_ITUAngleRateEarthRot);

    // asin() of the argument clamped to [-1, 1]; device only.
    __device__ double myAsin(double angle);
    // acos() of the argument clamped to [-1, 1].
    __host__ __device__ double myAcos(double angle);
    // Wraps an angle given in radians.
    __host__ __device__ double negPiToPi(double angle);
    // Degrees -> radians.
    __host__ __device__ double degToRad(double angle);
    // Radians -> degrees; device only.
    __device__ double radToDeg(double angle);
};

/*
 * Cartesian: a plain triple of double coordinates, constructible on both
 * host and device.
 */
class Cartesian
{
public:
    double _X;
    double _Y;
    double _Z;

private:
    // Initialised to -1 by the constructor; never read in the code visible here.
    double _m;

public:
    // Stores the three coordinates as given and sets _m to -1.
    __host__ __device__ Cartesian(double x, double y, double z)
        : _X(x), _Y(y), _Z(z), _m(-1)
    {
    }
};

/*
 * Propagator: orbital state prepared once on the host by propagator() and
 * then sampled on the device by evaluate().
 */
class Propagator
{
public:
    // --- values filled in by propagator() -------------------------------
    double _iDeg;       // inclination as passed in (incDeg)
    double _a;          // semi-major axis as passed in (smaKm)
    double _omega_0;    // aopDeg converted to radians
    double _OMEGA_0;    // raanDeg converted to radians
    double _omega_r;    // rate multiplied by timeSec and added to _omega_0 in evaluate()
    double _OMEGA_r;    // rate multiplied by timeSec and added to _OMEGA_0 in evaluate()
    double _rho;        // written by evaluate(): _p / (1 + _e * cos(v))
    double _SinI;       // sine of the inclination
    double _CosI;       // cosine of the inclination
    double _p;          // _a * (1 - e * e)
    double _e;          // eccentricity as passed in
    double _ReKm;       // Global::_ITURadEarth / 1000
    double _n0;         // sqrt(mu / _a^3)
    double _n_bar;      // _n0 scaled by the J2 correction term
    double _M0;         // mean anomaly derived from trueAnomalyDeg
    double _sqrt_e;     // sqrt((1 + e) / (1 - e))
    int    _orbitCase = -1;     // -1 until propagator() selects case 1, 2 or 3
    double _WdeltaRad;          // WdeltaDeg converted to radians
    double _precessionRateRad;  // precessionRateDeg converted to radians

    // --- values cached by evaluate() -------------------------------------
    double _artificialPrecessionRad = DBL_MIN;  // last artificialPrecessionRad seen
    double _simulationDuration = DBL_MIN;       // last simulationDuration seen
    double _incrementWdeltaRad;                 // (_WdeltaRad * 2) / _simulationDuration

    // Host-side initialisation from the orbital elements and mode flags.
    void propagator(double smaKm,
                    double incDeg,
                    double e,
                    double raanDeg,
                    double aopDeg,
                    double trueAnomalyDeg,
                    bool   stationKeeping,
                    double WdeltaDeg,
                    bool   precessionMechanismSupplied,
                    double precessionRateDeg);

    // Position at timeSec; the coordinates are the in-plane position
    // (scaled by 1000) rotated by rotateOrbitalElements().
    __device__ Cartesian evaluate(double timeSec,
                                  double simulationDuration,
                                  double artificialPrecessionRad,
                                  bool   ECImode);

    // Newton iteration for E in E - e * sin(E) = M, stopped once two
    // successive iterates differ by at most epsilon.
    __device__ double solveKepler(double M,
                                  double e,
                                  double epsilon);

    // Applies the rotation built from omega, OMEGA and the inclination
    // cosine/sine to pq0.
    __device__ Cartesian rotateOrbitalElements(Cartesian pq0,
                                               double    omega,
                                               double    OMEGA,
                                               double    CosI,
                                               double    SinI);
};

#endif /* !PROPAGATORSAT_CUH_ */

/*
 * acos() of `angle` after clamping it to [-1, 1], so that values slightly
 * outside the domain (e.g. from rounding) do not produce NaN.
 */
__host__ __device__ double  Global::myAcos(double angle)
{
    double          clamped = angle;

    if (angle > 1)
        clamped = 1;
    else if (angle < -1)
        clamped = -1;
    return (acos(clamped));
}

/*
 * asin() of `angle` after clamping it to [-1, 1], so that values slightly
 * outside the domain (e.g. from rounding) do not produce NaN.
 */
__device__ double   Global::myAsin(double angle)
{
    double          clamped = angle;

    if (angle > 1)
        clamped = 1;
    else if (angle < -1)
        clamped = -1;
    return (asin(clamped));
}

/* Converts an angle from degrees to radians (angle * pi / 180). */
__host__ __device__ double  Global::degToRad(double angle)
{
    const double    scaled = angle * M_PI;

    return (scaled / 180.0);
}

/* Converts an angle from radians to degrees (angle * 180 / pi). */
__device__ double   Global::radToDeg(double angle)
{
    const double    scaled = angle * 180.0;

    return (scaled / M_PI);
}

/*
 * Wraps an angle in radians into the interval (-pi, pi].
 *
 * fmod() keeps the sign of its first argument, so its result lies in
 * (-2*pi, 2*pi); a negative remainder is shifted up by one full turn before
 * the upper half is folded down.  The previous code discarded the first
 * fmod() result and only added a single turn to the raw angle, which left
 * inputs below -3*pi outside the target interval.
 */
__host__ __device__ double  Global::negPiToPi(double angle)
{
    double          output;

    output = fmod(angle, TWO_PI);
    if (output < 0)
        output += TWO_PI;
    return ((output > M_PI) ? (output - TWO_PI) : (output));
}

/*
 * Host-side initialisation of the propagator state.
 *
 * smaKm, incDeg, e, raanDeg, aopDeg, trueAnomalyDeg : orbital elements;
 *     the angles are converted with Global::degToRad().
 * stationKeeping, precessionMechanismSupplied : select _orbitCase
 *     (1: no station keeping, 2: station keeping without a supplied
 *     precession mechanism, 3: both).
 * WdeltaDeg, precessionRateDeg : stored in radians for evaluate().
 *
 * _rho and _incrementWdeltaRad are not set here; evaluate() writes them.
 */
void        Propagator::propagator(double smaKm, double incDeg, double e, double raanDeg, double aopDeg, double trueAnomalyDeg, bool stationKeeping, double WdeltaDeg, bool precessionMechanismSupplied, double precessionRateDeg)
{
    Global          constants;

    // Inclination and shape of the orbit.
    _iDeg = incDeg;
    const double    inclinationRad = constants.degToRad(_iDeg);
    _CosI = cos(inclinationRad);
    _SinI = sin(inclinationRad);
    _e = e;
    _a = smaKm;

    // Mean anomaly: for e == 0 it is the true anomaly itself, otherwise it
    // goes through the eccentric anomaly.
    const double    nuRad = constants.degToRad(trueAnomalyDeg);
    double          meanAnomaly = nuRad;
    if (e != 0)
    {
        const double    cosNu = cos(nuRad);
        double          eccAnomaly = constants.myAcos((e + cosNu) / (1 + e * cosNu));

        // acos() only covers half a turn; mirror for negative wrapped anomalies.
        if (constants.negPiToPi(nuRad) < 0)
            eccAnomaly = M_PI * 2 - eccAnomaly;
        meanAnomaly = eccAnomaly - e * sin(eccAnomaly);
    }
    _M0 = meanAnomaly;

    _OMEGA_0 = constants.degToRad(raanDeg);
    _omega_0 = constants.degToRad(aopDeg);
    _p = _a * (1 - e * e);
    _ReKm = constants._ITURadEarth / 1000;

    // Mean motion and the J2-dependent rates; j2Term is the factor shared
    // by all three expressions.
    const double    gravCst = constants._ITUGravCst;
    _n0 = sqrt(gravCst / pow(_a, 3));
    const double    j2Term = 1.5 * constants._ITUJ2 * pow(_ReKm, 2) / pow(_p, 2);
    _n_bar = _n0 * (1.0 + j2Term * (1.0 - 1.5 * pow(_SinI, 2)) * pow(1.0 - pow(e, 2), 0.5));
    _OMEGA_r = -j2Term * _n_bar * _CosI;
    _omega_r = j2Term * _n_bar * (2.0 - 2.5 * pow(_SinI, 2));

    _sqrt_e = sqrt((1 + e) / (1 - e));
    _WdeltaRad = constants.degToRad(WdeltaDeg);
    _precessionRateRad = constants.degToRad(precessionRateDeg);

    _orbitCase = (!stationKeeping) ? 1 : ((!precessionMechanismSupplied) ? 2 : 3);
}

/*
 * Rotates pq0 by the 3x3 matrix built from the angles omega and OMEGA and
 * from the inclination cosine/sine, and returns the rotated vector.
 */
__device__ Cartesian    Propagator::rotateOrbitalElements(Cartesian pq0, double omega, double OMEGA, double CosI, double SinI)
{
    const double        cO = cos(OMEGA);
    const double        sO = sin(OMEGA);
    const double        cw = cos(omega);
    const double        sw = sin(omega);

    // Rotation matrix, row by row.
    const double        r11 = cO * cw - sO * sw * CosI;
    const double        r12 = -cO * sw - sO * cw * CosI;
    const double        r13 = sO * SinI;

    const double        r21 = sO * cw + cO * sw * CosI;
    const double        r22 = -sO * sw + cO * cw * CosI;
    const double        r23 = -cO * SinI;

    const double        r31 = sw * SinI;
    const double        r32 = cw * SinI;
    const double        r33 = CosI;

    // Matrix-vector product.
    const double        outX = r11 * pq0._X + r12 * pq0._Y + r13 * pq0._Z;
    const double        outY = r21 * pq0._X + r22 * pq0._Y + r23 * pq0._Z;
    const double        outZ = r31 * pq0._X + r32 * pq0._Y + r33 * pq0._Z;

    return (Cartesian(outX, outY, outZ));
}
/*
 * Computes the position at `timeSec`.
 *
 * timeSec                 : time at which the orbit is sampled.
 * simulationDuration      : total duration, used by orbit cases 2 and 3.
 * artificialPrecessionRad : extra rate applied to OMEGA in orbit case 1.
 * ECImode                 : when false, OMEGA is additionally rotated by
 *                           -(J2000 angle + timeSec * Earth rotation rate).
 *
 * Returns the in-plane position (scaled by 1000) rotated by
 * rotateOrbitalElements().
 *
 * The kernel in this file calls this method from every thread on the SAME
 * Propagator object.  All per-call intermediate values (rho, the Wdelta
 * increment, the duration) are therefore taken from locals and arguments;
 * reading them back from the members would let a thread pick up a value
 * written by another thread, or one not written yet.  The members are still
 * updated so that code reading _rho, _simulationDuration,
 * _artificialPrecessionRad or _incrementWdeltaRad after a call keeps
 * working; with concurrent callers _rho simply holds the value of whichever
 * thread wrote last.
 */
__device__ Cartesian    Propagator::evaluate(double timeSec, double simulationDuration, double artificialPrecessionRad, bool ECImode = true)
{
    double              M, E, v, cosV, sinV, rho, rotationAngleECF, omega, OMEGA, incrementWdeltaRad;
    Global              global;

    // Same value the cached member receives below, computed from the argument.
    incrementWdeltaRad = (_WdeltaRad * 2) / simulationDuration;
    if (_simulationDuration != simulationDuration || _artificialPrecessionRad != artificialPrecessionRad)
    {
        _simulationDuration = simulationDuration;
        _artificialPrecessionRad = artificialPrecessionRad;
        _incrementWdeltaRad = incrementWdeltaRad;
    }
    M = _M0 + ((_orbitCase == 3) ? _n0 : _n_bar) * timeSec;
    // For e == 0 the eccentric anomaly equals the mean anomaly.
    E = (_e == 0) ? M : solveKepler(M, _e, 1e-8);
    v = 2.0 * atan(_sqrt_e * tan(E / 2));
    cosV = cos(v);
    sinV = sin(v);
    rho = _p / (1 + _e * cosV);
    _rho = rho;
    rotationAngleECF = (ECImode) ? 0 : -1 * (global._J2000AngleRad + timeSec * global._ITUAngleRateEarthRotRad);
    omega = _omega_0 + ((_orbitCase == 3) ? 0 : _omega_r * timeSec);
    OMEGA = _OMEGA_0 + rotationAngleECF + ((_orbitCase == 3) ? 0 : _OMEGA_r * timeSec);
    if (_orbitCase == 1)
        OMEGA += artificialPrecessionRad * timeSec;
    else if (_orbitCase == 2)
        OMEGA += _WdeltaRad * ((2.0 * timeSec / simulationDuration) - 1);
    else if (_orbitCase == 3)
        OMEGA += _precessionRateRad * timeSec - _WdeltaRad + incrementWdeltaRad * timeSec;
    Cartesian pq0 = Cartesian(1000 * rho * cosV, 1000 * rho * sinV, 0);

    Cartesian positionECI = Propagator::rotateOrbitalElements(pq0, omega, OMEGA, _CosI, _SinI);

    return (positionECI);
}
/*
 * Solves E - e * sin(E) = M for E with Newton's method, starting from
 * E = M.
 *
 * M       : right-hand side of the equation.
 * e       : eccentricity.
 * epsilon : iteration stops once two successive iterates differ by at
 *           most this amount.
 *
 * Returns the last iterate.
 *
 * Two safeguards compared with a bare while loop:
 *  - fabs() is used instead of unqualified abs(), which on some toolchains
 *    can resolve to the integer overload and truncate the difference to 0;
 *  - the number of iterations is capped, so that a non-converging
 *    (oscillating) iteration cannot keep a GPU thread spinning forever.
 *    When the cap is hit the current iterate is returned as is.
 */
__device__ double   Propagator::solveKepler(double M, double e, double epsilon)
{
    const int       maxIterations = 1000;
    double          En, Ens;

    En = M;
    Ens = En - (En - e * sin(En) - M) / (1 - e * cos(En));
    for (int iteration = 0; iteration < maxIterations && fabs(Ens - En) > epsilon; ++iteration)
    {
        En = Ens;
        Ens = En - (En - e * sin(En) - M) / (1 - e * cos(En));
    }
    return (Ens);
}

/*
 * Evaluates the propagator once per launched thread, at time STEP * tid.
 *
 * Launch layout: one-dimensional blocks (only threadIdx.x / blockDim.x are
 * used) in a grid of up to three dimensions; the blocks are flattened in
 * x, then y, then z order.
 *
 * CUDA_prop : device pointer to at least one Propagator; only element 0 is
 *             used, and it is shared by all threads.
 *
 * The flat index is accumulated in size_t.  blockIdx, gridDim and blockDim
 * hold unsigned int components, so without the widening casts the products
 * would wrap around at 2^32 before being stored in `tid`.
 */
__global__ void kernel(Propagator *CUDA_prop)
{
    const size_t    blockId = (size_t)blockIdx.x
                            + (size_t)blockIdx.y * gridDim.x
                            + (size_t)gridDim.x * gridDim.y * blockIdx.z;
    const size_t    tid = blockId * blockDim.x + threadIdx.x;

    // NOTE(review): the bounds guard below is disabled, so every launched
    // thread evaluates, including those whose STEP * tid exceeds TOTAL_TIME.
    //if (tid < NB_IT)
    Cartesian positionNGSOsatECI = CUDA_prop[0].evaluate(STEP * tid, TOTAL_TIME, 0);
    (void)positionNGSOsatECI;   // the result is not stored anywhere
}

/*
 * Builds one Propagator on the host, copies it to the device, launches the
 * kernel on a 1000 x 1024 configuration and prints the elapsed time between
 * the two CUDA events (allocation, copy, kernel and free included).
 */
int     main(void)
{
    cudaEvent_t     startEvent, stopEvent;

    // Timing starts before any other CUDA work.
    HANDLE_ERROR(cudaEventCreate(&startEvent));
    HANDLE_ERROR(cudaEventCreate(&stopEvent));
    HANDLE_ERROR(cudaEventRecord(startEvent, 0));

    Propagator      hostProp;
    Propagator      *deviceProp;
    const dim3      gridSize(1000, 1, 1);   // number of blocks
    const dim3      blockSize(1024, 1, 1);  // threads per block

    hostProp.propagator(7847.3, 53, 0, 18, 0, 67.5, true, 5, true, 3.4000000596279278E-05);

    // Upload the single propagator instance shared by all threads.
    HANDLE_ERROR(cudaMalloc((void **)&deviceProp, sizeof(Propagator)));
    HANDLE_ERROR(cudaMemcpy(deviceProp, &hostProp, sizeof(Propagator), cudaMemcpyHostToDevice));

    kernel <<< gridSize, blockSize >>> (deviceProp);
    gpuErrchk(cudaPeekAtLastError());       // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());     // errors raised while the kernel ran

    HANDLE_ERROR(cudaFree(deviceProp));

    HANDLE_ERROR(cudaEventRecord(stopEvent, 0));
    HANDLE_ERROR(cudaEventSynchronize(stopEvent));

    float           elapsedTime;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, startEvent, stopEvent));
    printf("time : %f ms\n", elapsedTime);

    HANDLE_ERROR(cudaEventDestroy(startEvent));
    HANDLE_ERROR(cudaEventDestroy(stopEvent));
    return (0);
}

如果我启动那么多“线程”，它在大约 30 万个块以内可以正常工作；但有时同样的数量却不行。我会收到一个错误：“unknown error”（未知错误），出自这一行：

gpuErrchk(cudaDeviceSynchronize());

或者出自 cudaFree，或内核调用之后的其他某个函数。

如果我只用 1k 个块、每块 1k 个线程启动，并在 cuda-memcheck 下运行，我会得到与之前相同的错误；但不使用 cuda-memcheck 时，它运行得很好。

我不知道造成这个问题的原因以及如何解决这个问题

注意：HANDLE_ERROR 宏可以替换为 gpuErrchk 宏；它是某个库中的一个定义，功能完全相同。

我还想知道如何根据硬件规格（或其他任何信息）来确定我可以启动的最大线程数量。

Answer 1

在使用WDDM驱动程序的Windows上，可以批量启动多个内核，以减少启动开销。由于监视程序计时器适用于整个批处理，因此即使每个内核本身都在所选的超时值内完成，也可以触发超时。

一种廉价的、可强制立即执行到目前为止已启动的所有内核的方法，是调用cudaStreamQuery(0)。与调用cudaDeviceSynchronize()不同，它会立即返回，而不是等待内核完成。

在内核调用之间散布cudaStreamQuery(0)因此可确保WDDM超时仅适用于两个cudaStreamQuery(0)调用之间的内核。

如果单个内核本身就耗时过长并触发了看门狗，可以尝试将其拆分为多次调用，每次调用使用较少的块，并在各次调用之间同样调用cudaStreamQuery(0)。这样不仅能避免触发看门狗，还能让GUI保持一定的响应。

CUDA内核仅以某些网格大小

1 个答案: