我目前正在使用CUDA,并尝试用它来优化一个程序。我有一个很大的内核,需要启动的线程数量在10万以上到1亿以上之间,甚至可能达到数十亿。
所以我读到使用dim3变量允许启动那么多线程(cf:https://devtalk.nvidia.com/default/topic/621867/size-limitation-for-1d-arrays-in-cuda-/?offset=7)
我有一段示例代码(在我的GTX 970上运行),它有时能正常运行,有时却不能。
// Include guard. The macro defined here must be spelled exactly like the one
// tested by #ifndef, otherwise a second inclusion is not suppressed.
#ifndef PROPAGATORSAT_CUH_
# define PROPAGATORSAT_CUH_
// Pi and 2*pi as double-precision constants.
# define M_PI (3.14159265358979323846)
# define TWO_PI (2 * M_PI)
// Simulated time span and time step; both are handed to
// Propagator::evaluate() as seconds by the kernel.
# define TOTAL_TIME (615359.772)
# define STEP (0.771)
// Number of time steps covered by TOTAL_TIME (a double expression).
# define NB_IT (TOTAL_TIME / (double)STEP)
// Threads per block and the number of blocks needed to cover NB_IT steps
// (ceiling division). The outer parentheses keep the cast bound to the whole
// expression wherever the macro is expanded.
# define NB_THREADS (1024)
# define NB_BLOCKS ((int)((NB_IT + NB_THREADS - 1) / NB_THREADS))
# include <cmath>
# include <cfloat>
# include <stdio.h>
# include "../common/book.h"
# include "cuda_runtime.h"
# include "device_launch_parameters.h"
# define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA runtime call on stderr, together with the source
// location supplied by the gpuErrchk macro.
//   code  - status returned by the CUDA call
//   file  - source file of the call site
//   line  - line number of the call site
//   abort - when true (the default) the process exits with `code` as status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    // Nothing to report for a successful call.
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}
// Bundle of physical constants and small angle helpers used by Propagator.
// The constants are non-static const members, so every Global instance
// carries its own copy of them.
class Global
{
public:
// Earth radius. Propagator::propagator() divides it by 1000 to obtain
// kilometres, so the value is expressed in metres.
const double _ITURadEarth = 6378145.0;
// Gravitational parameter used as `mu` in sqrt(mu / a^3) with `a` in km
// (presumably km^3/s^2 - TODO confirm).
const double _ITUGravCst = 3.986012E5;
// J2 coefficient used in the secular drift terms of Propagator::propagator().
const double _ITUJ2 = 0.001082636;
// Reference angle in degrees, currently zero; the commented-out value is an
// alternative that is not in use.
const double _J2000AngleDeg = 0;//-79.8058;
// Same reference angle in radians, currently zero.
const double _J2000AngleRad = 0;//TO_RAD(_J2000AngleDeg);
// Earth rotation rate. It is converted with degToRad() below and multiplied
// by a time in seconds in Propagator::evaluate(), i.e. degrees per second.
const double _ITUAngleRateEarthRot = 4.1780745823E-3;
// Earth rotation rate in radians per second. Relies on _ITUAngleRateEarthRot
// being declared (and therefore initialised) before this member.
const double _ITUAngleRateEarthRotRad = degToRad(_ITUAngleRateEarthRot);
public:
// asin() of the argument after clamping it to [-1, 1]. Device only.
__device__ double myAsin(double angle);
// acos() of the argument after clamping it to [-1, 1].
__host__ __device__ double myAcos(double angle);
// Normalises an angle in radians into a signed range around zero using
// fmod with 2*pi.
__host__ __device__ double negPiToPi(double angle);
// Degrees to radians.
__host__ __device__ double degToRad(double angle);
// Radians to degrees. Device only.
__device__ double radToDeg(double angle);
};
// Three-component Cartesian vector returned by the propagator.
class Cartesian
{
public:
// Vector components. Propagator::evaluate() fills them with values scaled
// by 1000 from kilometre-based quantities.
double _X;
double _Y;
double _Z;
private:
// Initialised to -1 by the constructor and not read anywhere in the visible
// code (presumably a lazily computed magnitude - TODO confirm).
double _m;
public:
// Builds the vector from its three components; usable on host and device.
__host__ __device__ Cartesian(double x, double y, double z) : _X(x), _Y(y), _Z(z), _m(-1) {}
};
// Orbit propagator state and operations. An instance is initialised on the
// host with propagator(), copied bytewise to the device and then queried with
// evaluate() from device code.
class Propagator
{
public:
// Inclination in degrees, as passed to propagator().
double _iDeg;
// Semi-major axis, set from the smaKm argument of propagator().
double _a;
// aopDeg converted to radians (value at time zero).
double _omega_0;
// raanDeg converted to radians (value at time zero).
double _OMEGA_0;
// J2-based rate that evaluate() multiplies by the time and adds to _omega_0.
double _omega_r;
// J2-based rate that evaluate() multiplies by the time and adds to _OMEGA_0.
double _OMEGA_r;
// Orbital radius _p / (1 + _e * cos(v)); written by evaluate().
double _rho;
// Sine and cosine of the inclination.
double _SinI;
double _CosI;
// _a * (1 - e^2).
double _p;
// Eccentricity.
double _e;
// Earth radius in kilometres (Global::_ITURadEarth / 1000).
double _ReKm;
// sqrt(mu / _a^3).
double _n0;
// _n0 corrected by the J2 term computed in propagator().
double _n_bar;
// Mean anomaly at time zero, in radians.
double _M0;
// sqrt((1 + e) / (1 - e)), used to convert eccentric to true anomaly.
double _sqrt_e;
// Selected by propagator(): 1 = no station keeping, 2 = station keeping
// without a supplied precession mechanism, 3 = station keeping with one.
// -1 until propagator() has been called.
int _orbitCase = -1;
// WdeltaDeg converted to radians.
double _WdeltaRad;
// precessionRateDeg converted to radians.
double _precessionRateRad;
// Arguments of the most recent evaluate() call; DBL_MIN marks "not set yet".
double _artificialPrecessionRad = DBL_MIN;
double _simulationDuration = DBL_MIN;
// (2 * _WdeltaRad) / _simulationDuration, stored by evaluate().
double _incrementWdeltaRad;
// Host-side initialiser: converts the supplied orbital elements and options
// into the member values above. Angles are given in degrees.
void propagator(double smaKm,
double incDeg,
double e,
double raanDeg,
double aopDeg,
double trueAnomalyDeg,
bool stationKeeping,
double WdeltaDeg,
bool precessionMechanismSupplied,
double precessionRateDeg);
// Computes the position at timeSec. With ECImode false an Earth-rotation
// angle is subtracted from the ascending-node angle. Also updates the cached
// members listed above.
__device__ Cartesian evaluate(double timeSec,
double simulationDuration,
double artificialPrecessionRad,
bool ECImode);
// Solves Kepler's equation E - e*sin(E) = M for E by Newton iteration until
// two successive iterates differ by at most epsilon.
__device__ double solveKepler(double M,
double e,
double epsilon);
// Rotates the in-plane vector pq0 by the angles omega, OMEGA and the
// inclination given through its cosine and sine.
__device__ Cartesian rotateOrbitalElements(Cartesian pq0,
double omega,
double OMEGA,
double CosI,
double SinI);
};
#endif /* !PROPAGATORSAT_CUH_ */
// Arc cosine with the argument clamped to [-1, 1], so that values pushed
// slightly out of range by rounding do not produce NaN.
// `angle` is the cosine value to invert; the result is in radians.
__host__ __device__ double Global::myAcos(double angle)
{
    if (angle > 1)
        return acos(1.0);
    if (angle < -1)
        return acos(-1.0);
    return acos(angle);
}
// Arc sine with the argument clamped to [-1, 1], so that values pushed
// slightly out of range by rounding do not produce NaN.
// `angle` is the sine value to invert; the result is in radians.
__device__ double Global::myAsin(double angle)
{
    if (angle > 1)
        return asin(1.0);
    if (angle < -1)
        return asin(-1.0);
    return asin(angle);
}
// Converts an angle from degrees to radians.
__host__ __device__ double Global::degToRad(double angle)
{
    // Multiply first, then divide, exactly as angle * M_PI / 180.0 evaluates.
    const double timesPi = angle * M_PI;
    return timesPi / 180.0;
}
// Converts an angle from radians to degrees.
__device__ double Global::radToDeg(double angle)
{
    // Multiply first, then divide, exactly as angle * 180.0 / M_PI evaluates.
    const double timesHalfTurn = angle * 180.0;
    return timesHalfTurn / M_PI;
}
// Wraps an angle given in radians into the interval (-pi, pi].
//
// The reduction is done in two steps: fmod() first brings the angle into
// (-2*pi, 2*pi), then adding 2*pi and reducing again yields [0, 2*pi).
// The second step has to operate on the result of the first one; applying it
// to the raw input leaves angles below -2*pi negative after the reduction and
// can return values outside the target interval (e.g. -3.5*pi -> -1.5*pi).
__host__ __device__ double Global::negPiToPi(double angle)
{
    double output;
    output = fmod(angle, TWO_PI);
    output = fmod(output + TWO_PI, TWO_PI);
    // Map the upper half turn (pi, 2*pi) onto (-pi, 0).
    return ((output > M_PI) ? (output - TWO_PI) : (output));
}
// Initialises the propagator from a set of orbital elements (host side).
//   smaKm                       - semi-major axis, stored in _a
//   incDeg                      - inclination in degrees
//   e                           - eccentricity
//   raanDeg, aopDeg             - stored as _OMEGA_0 and _omega_0 in radians
//   trueAnomalyDeg              - true anomaly at time zero, in degrees
//   stationKeeping              - false selects orbit case 1
//   WdeltaDeg                   - stored as _WdeltaRad in radians
//   precessionMechanismSupplied - with station keeping: false selects case 2,
//                                 true selects case 3
//   precessionRateDeg           - stored as _precessionRateRad in radians
void Propagator::propagator(double smaKm, double incDeg, double e, double raanDeg, double aopDeg, double trueAnomalyDeg, bool stationKeeping, double WdeltaDeg, bool precessionMechanismSupplied, double precessionRateDeg)
{
    Global constants;

    // Inclination and its sine/cosine.
    _iDeg = incDeg;
    const double inclinationRad = constants.degToRad(_iDeg);
    _CosI = cos(inclinationRad);
    _SinI = sin(inclinationRad);

    _e = e;
    _a = smaKm;

    // Mean anomaly at time zero, derived from the true anomaly.
    const double nuRad = constants.degToRad(trueAnomalyDeg);
    if (e != 0)
    {
        const double cosNu = cos(nuRad);
        double eccAnomaly = constants.myAcos((e + cosNu) / (1 + e * cosNu));
        // acos() only covers half a turn; mirror for negative true anomalies.
        if (constants.negPiToPi(nuRad) < 0)
            eccAnomaly = M_PI * 2 - eccAnomaly;
        _M0 = eccAnomaly - e * sin(eccAnomaly);
    }
    else
    {
        // Circular orbit: mean and true anomaly coincide.
        _M0 = nuRad;
    }

    _OMEGA_0 = constants.degToRad(raanDeg);
    _omega_0 = constants.degToRad(aopDeg);
    _p = _a * (1 - e * e);
    _ReKm = constants._ITURadEarth / 1000;

    // Mean motion and the J2-based rates.
    const double gravParam = constants._ITUGravCst;
    _n0 = sqrt(gravParam / pow(_a, 3));
    _n_bar = _n0 * (1.0 + 1.5 * constants._ITUJ2 * pow(_ReKm, 2) / pow(_p, 2) * (1.0 - 1.5 * pow(_SinI, 2)) * pow(1.0 - pow(e, 2), 0.5));
    _OMEGA_r = -1.5 * constants._ITUJ2 * pow(_ReKm, 2) / pow(_p, 2) * _n_bar * _CosI;
    _omega_r = 1.5 * constants._ITUJ2 * pow(_ReKm, 2) / pow(_p, 2) * _n_bar * (2.0 - 2.5 * pow(_SinI, 2));

    _sqrt_e = sqrt((1 + e) / (1 - e));
    _WdeltaRad = constants.degToRad(WdeltaDeg);
    _precessionRateRad = constants.degToRad(precessionRateDeg);

    // Orbit case used by evaluate() to pick the node-angle model.
    if (!stationKeeping)
        _orbitCase = 1;
    else
        _orbitCase = precessionMechanismSupplied ? 3 : 2;
}
// Applies the 3x3 rotation built from the angles omega and OMEGA (radians)
// and the inclination (given as CosI / SinI) to the vector pq0 and returns
// the rotated vector.
__device__ Cartesian Propagator::rotateOrbitalElements(Cartesian pq0, double omega, double OMEGA, double CosI, double SinI)
{
    const double cosNode = cos(OMEGA);
    const double sinNode = sin(OMEGA);
    const double cosArg = cos(omega);
    const double sinArg = sin(omega);

    // Rotation matrix, row by row.
    const double r11 = cosNode * cosArg - sinNode * sinArg * CosI;
    const double r12 = -cosNode * sinArg - sinNode * cosArg * CosI;
    const double r13 = sinNode * SinI;

    const double r21 = sinNode * cosArg + cosNode * sinArg * CosI;
    const double r22 = -sinNode * sinArg + cosNode * cosArg * CosI;
    const double r23 = -cosNode * SinI;

    const double r31 = sinArg * SinI;
    const double r32 = cosArg * SinI;
    const double r33 = CosI;

    // Matrix-vector product.
    const double rotatedX = r11 * pq0._X + r12 * pq0._Y + r13 * pq0._Z;
    const double rotatedY = r21 * pq0._X + r22 * pq0._Y + r23 * pq0._Z;
    const double rotatedZ = r31 * pq0._X + r32 * pq0._Y + r33 * pq0._Z;

    return Cartesian(rotatedX, rotatedY, rotatedZ);
}
// Computes the position at time `timeSec`.
//   timeSec                 - time since the start, in seconds
//   simulationDuration      - total duration used by the station-keeping
//                             models (orbit cases 2 and 3)
//   artificialPrecessionRad - extra node rate applied in orbit case 1
//   ECImode                 - when false, an Earth-rotation angle is
//                             subtracted from the node angle
// Returns the rotated position, scaled by 1000 from the kilometre-based
// radius.
//
// The kernel calls this on one shared Propagator from every thread. All
// values that feed the result are therefore kept in locals: reading back
// members that other threads write concurrently (_rho differs per thread,
// and _incrementWdeltaRad may not have been stored yet by the thread that
// won the cache check) would make the result depend on thread timing.
// The members are still updated as before for callers that inspect them.
__device__ Cartesian Propagator::evaluate(double timeSec, double simulationDuration, double artificialPrecessionRad, bool ECImode = true)
{
    Global global;

    // Derived from the arguments only, so every thread gets the same value.
    const double incrementWdeltaRad = (_WdeltaRad * 2) / simulationDuration;
    if (_simulationDuration != simulationDuration || _artificialPrecessionRad != artificialPrecessionRad)
    {
        _simulationDuration = simulationDuration;
        _artificialPrecessionRad = artificialPrecessionRad;
        _incrementWdeltaRad = incrementWdeltaRad;
    }

    // Mean anomaly -> eccentric anomaly -> true anomaly.
    const double M = _M0 + ((_orbitCase == 3) ? _n0 : _n_bar) * timeSec;
    const double E = (_e == 0) ? M : solveKepler(M, _e, 1e-8);
    const double v = 2.0 * atan(_sqrt_e * tan(E / 2));
    const double cosV = cos(v);
    const double sinV = sin(v);

    // Orbital radius; stored in the member for compatibility, but only the
    // local copy is used below.
    const double rho = _p / (1 + _e * cosV);
    _rho = rho;

    const double rotationAngleECF = (ECImode) ? 0 : -1 * (global._J2000AngleRad + timeSec * global._ITUAngleRateEarthRotRad);
    const double omega = _omega_0 + ((_orbitCase == 3) ? 0 : _omega_r * timeSec);
    double OMEGA = _OMEGA_0 + rotationAngleECF + ((_orbitCase == 3) ? 0 : _OMEGA_r * timeSec);

    // Node-angle correction depending on the orbit case chosen in propagator().
    if (_orbitCase == 1)
        OMEGA += artificialPrecessionRad * timeSec;
    else if (_orbitCase == 2)
        OMEGA += _WdeltaRad * ((2.0 * timeSec / simulationDuration) - 1);
    else if (_orbitCase == 3)
        OMEGA += _precessionRateRad * timeSec - _WdeltaRad + incrementWdeltaRad * timeSec;

    Cartesian pq0 = Cartesian(1000 * rho * cosV, 1000 * rho * sinV, 0);
    Cartesian positionECI = Propagator::rotateOrbitalElements(pq0, omega, OMEGA, _CosI, _SinI);
    return (positionECI);
}
// Solves Kepler's equation E - e*sin(E) = M for the eccentric anomaly E
// with Newton's method, starting from E = M.
//   M       - mean anomaly in radians
//   e       - eccentricity
//   epsilon - stop once two successive iterates differ by at most this much
// Returns the last iterate.
//
// The iteration count is capped: Newton's method started at E = M is not
// guaranteed to converge for eccentricities close to 1, and an endless loop
// inside a kernel blocks the GPU until the driver watchdog kills it. When
// the cap is reached the best available iterate is returned.
__device__ double Propagator::solveKepler(double M, double e, double epsilon)
{
    const int maxIterations = 100;
    double previous;
    double current = M;
    int iteration = 0;

    do
    {
        previous = current;
        current = previous - (previous - e * sin(previous) - M) / (1 - e * cos(previous));
        ++iteration;
        // fabs() is the floating-point absolute value; no integer overload
        // can be selected by accident.
    } while (fabs(current - previous) > epsilon && iteration < maxIterations);

    return (current);
}
// Evaluates the propagator CUDA_prop[0] once per time step.
// Launch layout: one-dimensional blocks (only threadIdx.x is used) in a grid
// of up to three dimensions; each thread handles the time STEP * tid.
// CUDA_prop must point to at least one Propagator in device memory.
__global__ void kernel(Propagator *CUDA_prop)
{
    // Flatten the block index in 64-bit arithmetic. The built-in index
    // variables are 32-bit, so the products below wrap around once the grid
    // holds more than 2^32 threads unless they are widened first.
    const size_t blockId = (size_t)blockIdx.x
                         + (size_t)blockIdx.y * gridDim.x
                         + (size_t)gridDim.x * gridDim.y * blockIdx.z;
    const size_t tid = blockId * blockDim.x + threadIdx.x;

    // The grid rarely matches the step count exactly; surplus threads must
    // not evaluate time steps beyond the simulated interval.
    if (tid < NB_IT)
    {
        Cartesian positionNGSOsatECI = CUDA_prop[0].evaluate(STEP * tid, TOTAL_TIME, 0);
        // The position is not stored anywhere yet.
        (void)positionNGSOsatECI;
    }
}
// Sets up one propagator on the host, copies it to the device, runs the
// kernel on a 1000 x 1024 launch and prints the elapsed time measured with
// CUDA events (allocation, copy, kernel and free are all inside the timed
// region).
int main(void)
{
    cudaEvent_t startEvent;
    cudaEvent_t stopEvent;
    HANDLE_ERROR(cudaEventCreate(&startEvent));
    HANDLE_ERROR(cudaEventCreate(&stopEvent));
    HANDLE_ERROR(cudaEventRecord(startEvent, 0));

    Propagator hostProp[1];
    Propagator *deviceProp;
    // First launch argument is the grid size, second the block size.
    dim3 gridSize(1000, 1, 1);
    dim3 blockSize(1024, 1, 1);

    hostProp[0].propagator(7847.3, 53, 0, 18, 0, 67.5, true, 5, true, 3.4000000596279278E-05);

    HANDLE_ERROR(cudaMalloc((void **)&deviceProp, sizeof(Propagator)));
    HANDLE_ERROR(cudaMemcpy(deviceProp, hostProp, sizeof(Propagator), cudaMemcpyHostToDevice));

    kernel <<< gridSize, blockSize >>> (deviceProp);
    // Launch-configuration errors show up here, execution errors at the sync.
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    HANDLE_ERROR(cudaFree(deviceProp));

    HANDLE_ERROR(cudaEventRecord(stopEvent, 0));
    HANDLE_ERROR(cudaEventSynchronize(stopEvent));

    float elapsedMs;
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedMs, startEvent, stopEvent));
    printf("time : %f ms\n", elapsedMs);

    HANDLE_ERROR(cudaEventDestroy(startEvent));
    HANDLE_ERROR(cudaEventDestroy(stopEvent));
    return 0;
}
如果我启动那么多“线程”,大约到30万个块之前它都能正常工作,但有时同样的数量却不行。我收到一个“未知错误”(unknown error),来自这一行:
gpuErrchk(cudaDeviceSynchronize());
或者来自cudaFree,或者内核调用之后的其他某个函数。
如果我只用1k个块和1k个线程启动,并在cuda-memcheck下运行,就会得到与之前相同的错误;但不使用cuda-memcheck时,它运行得很好。
我不知道造成这个问题的原因以及如何解决这个问题
注意:HANDLE_ERROR宏可以替换为gpuErrchk宏,二者的定义完全相同,只是来自另一个库。
我还想知道如何根据硬件规格(spec)或其他信息来确定我可以启动的最大线程数量。
答案 0 :(得分:1)
在使用WDDM驱动程序的Windows上,可以批量启动多个内核,以减少启动开销。由于监视程序计时器适用于整个批处理,因此即使每个内核本身都在所选的超时值内完成,也可以触发超时。
到目前为止,一种廉价的强制立即执行所有内核的方法是调用cudaStreamQuery(0)
。与调用cudaDeviceSynchronize()
不同,这将立即返回,而不是等待内核完成。
在内核调用之间散布cudaStreamQuery(0)
因此可确保WDDM超时仅适用于两个cudaStreamQuery(0)
调用之间的内核。
如果即使单个内核需要太长时间并触发看门狗,也可以尝试将其拆分为多个调用,每个调用使用较少的块,然后再次调用cudaStreamQuery(0)
。这不仅使看门狗感到高兴,而且还使GUI有些反应。