Question

我正在研究CUDA中的算法，并希望了解内核的性能，以便我可以适当地优化它。

我需要仅通过修改源代码来判断我的内核是计算受限（compute-bound）还是访存受限（memory-bound）。NVIDIA 文档建议：去掉内存访问后运行内核，以测得计算时间；同样地，去掉所有计算后运行内核，以测得访存时间。

我不知道该如何正确地修改源代码来实现上述目标。如何在没有内存访问的情况下执行计算（或者说，如何在不访问内存中变量的情况下计算出结果）？能否以下面的代码为例，分别给出“仅访存”和“仅计算”两种情形的修改示例，这样我就可以自己完成全部修改了。

// Polynomial approximation of what the name suggests is the cumulative
// normal distribution, evaluated at d.
//
// The tail value is computed from |d| and then reflected (1 - tail) when d is
// positive. Uses the fast-math intrinsic __expf, which trades accuracy for
// speed.
__device__ inline float cndGPU(float d)
{
    // Coefficients of the fifth-degree polynomial in `k`.
    const float c1 = 0.31938153f;
    const float c2 = -0.356563782f;
    const float c3 = 1.781477937f;
    const float c4 = -1.821255978f;
    const float c5 = 1.330274429f;
    // Numerically 1 / sqrt(2 * pi).
    const float invSqrtTwoPi = 0.39894228040143267793994605993438f;

    // Polynomial argument; depends only on the magnitude of d.
    const float k = 1.0f / (1.0f + 0.2316419f * fabsf(d));

    // Gaussian factor times the polynomial (Horner form).
    const float tail = invSqrtTwoPi * __expf(- 0.5f * d * d) *
                       (k * (c1 + k * (c2 + k * (c3 + k * (c4 + k * c5)))));

    // Reflect for positive arguments; zero and negative d return the tail.
    return (d > 0) ? 1.0f - tail : tail;
}

// Computes a call and a put result for one option from the same intermediate
// terms and writes them through the two reference parameters.
//
// Outputs:
//   CallResult - receives S * cnd(d1) - X * exp(-R*T) * cnd(d2)
//   PutResult  - receives X * exp(-R*T) * (1 - cnd(d2)) - S * (1 - cnd(d1))
// Inputs:
//   S - stock price
//   X - option strike
//   T - option years
//   R - riskless rate
//   V - volatility rate
//
// Uses the fast-math intrinsics __logf and __expf (reduced accuracy, faster).
__device__ inline void BlackScholesBodyGPU(
    float &CallResult,
    float &PutResult,
    float S, //Stock price
    float X, //Option strike
    float T, //Option years
    float R, //Riskless rate
    float V  //Volatility rate
)
{
    const float rootT = sqrtf(T);

    // d1 / d2 arguments fed to cndGPU.
    const float argOne = (__logf(S / X) + (R + 0.5f * V * V) * T) / (V * rootT);
    const float argTwo = argOne - V * rootT;

    // Discount factor exp(-R*T); independent of the cndGPU evaluations.
    const float discount = __expf(- R * T);

    const float cndOne = cndGPU(argOne);
    const float cndTwo = cndGPU(argTwo);

    // Both results share the same intermediates, so produce them together.
    CallResult = S * cndOne - X * discount * cndTwo;
    PutResult  = X * discount * (1.0f - cndTwo) - S * (1.0f - cndOne);
}

Answer 1

我是这样理解的。假设你有：

// Illustrative example: returns the sum of two local constants and one
// element read from `arr`.
// NOTE(review): `arr` is not declared in this snippet, and `d` is a float used
// as an array index — treat this as pseudo-code, not compilable code.
float cndGPU(float d) {
    const float a = 1;
    const float b = 2;
    float c;

    // The read of arr[d] is the only memory access in this example.
    c = a + b + arr[d];

    return c;
}

测量没有内存访问时的计算时间——把所有计算写成一个表达式，并且不使用任何变量：

return 1 + 2 + 3; //just put some number that can be in arr[d]

测量内存访问时间——做法正好相反：

const float a = 1;
const float b = 2;
float c;

c = arr[d]; //here we have our memory access 

return c;

如何评估CUDA内核的内存时间和计算时间？

1 个答案: