Question

在尝试加速我的加密内核时，我研究了 32-bit int ADD 的 IPC 和 ILP。内核由一长串充分展开的 XOR 和 ADD 循环组成；在 Kepler（GTX Titan/780）上，每个 SM（192 个核心）每周期的吞吐量应为 160 ops。

我的内核的 IPC 在 3.28 处达到上限。使用 ILP 甚至会降低 IPC。显然 ILP 无法帮助实现我的目标——充分利用流水线，所以我写了一些小实验。我把 ADD 的代码放在最后。

Profiler测量

结果在GTX Titan上测量。
已检查 cubin 输出，以确保在优化期间不会消除任何指令。
Executed IPC 与 issued IPC 几乎相同，所以我只列出其中一个。

ADD 指令（XOR 具有相同的行为）

            | ILP 1  | ILP 2  | ILP 4  | ILP 8
--------------------------------------------------
IPC         | 4.00   | 3.32   | 2.72   | 3.44
--------------------------------------------------
Issue Slot  | 99.17% | 59.34% | 48.61% | 61.71%
Utilization |        |        |        |

我希望 ILP 2、4 和 8 可以提供更好的性能，但事实并非如此。
回想一下，整数吞吐量为 160。每个 SM 的 4 个 warp 调度器应该能够双发射，每个周期最多发出 5 条指令，因此 IPC 应该趋向 5。我如何解释我观察到的现象？为什么在 ILP 1、IPC = 4 时发射槽利用率就已达到 99%？

Float / Int ADD指令组合

如果我修改 ILP 4 的代码来执行两个 int ADD 和两个 float ADD：

IPC: 5.1
Issue slot utilization: 99.12%

奇怪的是，似乎warp调度程序在发出浮动操作方面做得更好。

讨论

现有文献建议使用 ILP 帮助达到浮点运算的最高性能。为什么 ILP 不适用于整数？如何对整数运算做到这一点？
我的内核理论上应该对每个候选进行 2.25 次整数运算。这与我在 cuobjdump 中观察到的一致。共有 2^48 个候选，因此 GTX Titan 上的最小运行时间应为 2.25 * 2^48 / (2688 * 160/192) / 876 MHz = 322.75s。这个估计合理吗？
我内核的测量运行时间为 523s。这是否真的意味着整数吞吐量仅为 160 * 3.28 (measure IPC) / 5 (max IPC)？

ILP测试代码

__device__ int x[10];
__global__ void test(int flag = 0)
{
    int a = x[0], b = x[1], c = x[2], d = x[3];
    int _a = x[4], _b = x[5], _c = x[6], _d = x[7];

    #pragma unroll 128
    for (int i = 0; i < 51200; ++i)
    {
        asm volatile("add.u32 %0, %0, %1;": "+r"(a): "r"(_a));
        asm volatile("add.u32 %0, %0, %1;": "+r"(b): "r"(_b));
        asm volatile("add.u32 %0, %0, %1;": "+r"(c): "r"(_c));
        asm volatile("add.u32 %0, %0, %1;": "+r"(d): "r"(_d));
    }

    int v = a + b + c + d;
    if (flag * v == 1)
        x[0] = v;
}

4名候选人的代码片段

每个候选需要 9 / 4 = 2.25 次操作。Cuobjdump 也验证了这一点。

d ^= d2(1, 3); // d2 is located in constant memory
s ^= d;
t ^= d2(1, 16);
u ^= d2(1, 17);
v ^= some_const;
flag_s = min(flag_s, s); // int min has throughput of 160
flag_t = flag_t || (s == t); // setp.or should be the same
flag_u = flag_u || (s == u);
flag_v = flag_v || (s == v);

Answer 1

我提供这个答案，是为了把此问题从未回答列表中移除。

我没有观察到 executed Instructions Per Count（IPC）随 Instruction Level Parallelism（ILP）而变化。总的来说，除了 OP 自己提供的信息之外，如果不了解更多信息（例如启动配置），很难讨论 OP 所观察到的现象的原因。

在下面的代码中，我考虑的是使用 float 的示例，不过我也用 int 测试了相同的代码，结论在概念上没有变化。该代码以 ILP=1、ILP=2 和 ILP=4 实现了循环的 Multiply Add（MAD）操作。

executed IPC 如下：

ILP         IPC            FLOPs
1           3.924          67108864
2           4.323          67108864
4           4.016          67108864

以上结果对应 N=8192。代码已使用 CUDA 8.0 进行编译，并在 NVIDIA GT920M 上运行。可以看出，对于所考虑的不同 ILP 值，IPC 几乎保持不变。假设每个 MAD 计 2 个 FLOP，代码估算的 Floating Point Operations（FLOPs）与 Visual Profiler 测量的结果一致。

代码

#include<stdio.h>
#include<stdlib.h>

#define N_ITERATIONS 8192

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCKSIZE   512

//#define DEBUG

/*******************************************************************/
/* KERNEL0 - BASELINE: ONE DEPENDENT MAD CHAIN PER THREAD (ILP = 1) */
/*******************************************************************/
// Each thread whose global index is below N loads one (a, b, c) triple,
// applies a = a * b + c N_ITERATIONS times and writes a back to d_a.
// Every iteration depends on the previous one, so a thread exposes a single
// dependency chain.
//   d_a : input/output values, at least N elements
//   d_b : multipliers (read only), at least N elements
//   d_c : addends (read only), at least N elements
//   N   : number of elements to process
__global__ void kernel0(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads of the last block have nothing to do.
    if (idx >= N) return;

    float acc = d_a[idx];
    const float mul = d_b[idx];
    const float add = d_c[idx];

    for (unsigned int it = 0; it < N_ITERATIONS; ++it) {
        acc = acc * mul + add;
    }

    d_a[idx] = acc;

}

/**************************************************************/
/* KERNEL1 - TWO INDEPENDENT MAD CHAINS PER THREAD (ILP = 2)  */
/**************************************************************/
// Each thread whose global index is below N / 2 processes two elements: its
// own index and that index shifted by N / 2. The two a = a * b + c chains do
// not depend on each other, so they can be interleaved.
// Only indices in [0, 2 * (N / 2)) are touched: when N is odd, the last
// element of d_a is left unchanged.
//   d_a : input/output values, at least N elements
//   d_b : multipliers (read only), at least N elements
//   d_c : addends (read only), at least N elements
//   N   : number of elements to process
__global__ void kernel1(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {

    const int half = N / 2;
    const int lo = blockIdx.x * blockDim.x + threadIdx.x;

    // Only the first N / 2 threads carry work.
    if (lo >= half) return;

    const int hi = lo + half;

    float acc0 = d_a[lo];
    float acc1 = d_a[hi];
    const float mul0 = d_b[lo];
    const float mul1 = d_b[hi];
    const float add0 = d_c[lo];
    const float add1 = d_c[hi];

    for (unsigned int it = 0; it < N_ITERATIONS; ++it) {
        acc0 = acc0 * mul0 + add0;
        acc1 = acc1 * mul1 + add1;
    }

    d_a[lo] = acc0;
    d_a[hi] = acc1;

}

/***************************************************************/
/* KERNEL2 - FOUR INDEPENDENT MAD CHAINS PER THREAD (ILP = 4)  */
/***************************************************************/
// Each thread whose global index is below N / 4 processes four elements: its
// own index plus the offsets 0, N / 4, N / 2 and 3 * N / 4. The four
// a = a * b + c chains are mutually independent.
// When N is a multiple of 4 the four slices tile [0, N) exactly; otherwise
// some elements of d_a are not updated.
//   d_a : input/output values, at least N elements
//   d_b : multipliers (read only), at least N elements
//   d_c : addends (read only), at least N elements
//   N   : number of elements to process
__global__ void kernel2(float * __restrict__ d_a, const float * __restrict__ d_b, const float * __restrict__ d_c, const int N) {

    const int base = blockIdx.x * blockDim.x + threadIdx.x;

    // Only the first N / 4 threads carry work.
    if (base >= N / 4) return;

    // Element indices of the four chains handled by this thread.
    const int i0 = base;
    const int i1 = base + N / 4;
    const int i2 = base + N / 2;
    const int i3 = base + 3 * N / 4;

    float acc0 = d_a[i0];
    float acc1 = d_a[i1];
    float acc2 = d_a[i2];
    float acc3 = d_a[i3];

    const float mul0 = d_b[i0];
    const float mul1 = d_b[i1];
    const float mul2 = d_b[i2];
    const float mul3 = d_b[i3];

    const float add0 = d_c[i0];
    const float add1 = d_c[i1];
    const float add2 = d_c[i2];
    const float add3 = d_c[i3];

    for (unsigned int it = 0; it < N_ITERATIONS; ++it) {
        acc0 = acc0 * mul0 + add0;
        acc1 = acc1 * mul1 + add1;
        acc2 = acc2 * mul2 + add2;
        acc3 = acc3 * mul3 + add3;
    }

    d_a[i0] = acc0;
    d_a[i1] = acc1;
    d_a[i2] = acc2;
    d_a[i3] = acc3;

}

/*********************/
/* RESULT COMPARISON */
/*********************/
// Compares the device result with the host reference element by element.
// Prints the first mismatch and returns false; returns true when all N
// entries are equal.
static bool resultsMatch(const float *h_device, const float *h_host, const int N) {

    for (int i = 0; i < N; i++) {
        if (h_device[i] != h_host[i]) {
            printf("Error at i=%i! Host = %f; Device = %f\n", i, h_host[i], h_device[i]);
            return false;
        }
    }
    return true;

}

/********/
/* MAIN */
/********/
// Runs kernel0 (ILP = 1), kernel1 (ILP = 2) and kernel2 (ILP = 4) on the same
// input, prints the operation count and throughput of each and checks the
// device result against a host reference.
// Returns 0 when all kernels match the reference, 1 on a mismatch or when a
// host allocation fails.
int main() {

    //const int N = 8192 * 64;
    const int N = 8192;
    //const int N = 1024;

    // Operation count reported for every kernel: one MAD per element per iteration.
    const float numOps = (float)N*(float)N_ITERATIONS;

    TimingGPU timerGPU;

    // --- Host buffers
    float *h_a = (float*)malloc(N*sizeof(float));
    float *h_a_result_host = (float*)malloc(N*sizeof(float));
    float *h_a_result_device = (float*)malloc(N*sizeof(float));
    float *h_b = (float*)malloc(N*sizeof(float));
    float *h_c = (float*)malloc(N*sizeof(float));

    if (h_a == NULL || h_a_result_host == NULL || h_a_result_device == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all five pointers is safe.
        free(h_a); free(h_a_result_host); free(h_a_result_device); free(h_b); free(h_c);
        return 1;
    }

    // --- Input data and host reference.
    // With b = 1 and c = 2 every step adds 2 to a; all intermediate values are
    // small integers that float represents exactly, so the exact comparison
    // against the device result below is valid.
    for (int i = 0; i<N; i++) {
        h_a[i] = 2.f;
        h_b[i] = 1.f;
        h_c[i] = 2.f;
        h_a_result_host[i] = h_a[i];
        for (unsigned int k = 0; k < N_ITERATIONS; k++) {
            h_a_result_host[i] = h_a_result_host[i] * h_b[i] + h_c[i];
        }
    }

    // --- Device buffers
    float *d_a; gpuErrchk(cudaMalloc((void**)&d_a, N*sizeof(float)));
    float *d_b; gpuErrchk(cudaMalloc((void**)&d_b, N*sizeof(float)));
    float *d_c; gpuErrchk(cudaMalloc((void**)&d_c, N*sizeof(float)));

    gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_c, h_c, N*sizeof(float), cudaMemcpyHostToDevice));

    bool ok = true;

    /***********/
    /* KERNEL0 */
    /***********/
    timerGPU.StartCounter();
    kernel0<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_a, d_b, d_c, N);
    // Launch-configuration errors are reported only through the last-error
    // state, so check it in every build, not just under DEBUG.
    gpuErrchk(cudaPeekAtLastError());
#ifdef DEBUG
    gpuErrchk(cudaDeviceSynchronize());
#endif
    // --- Remember: timing is in ms
    printf("Number of operations = %f; GFlops = %f\n", numOps, (1.e-6)*numOps / timerGPU.GetCounter());
    gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
    ok = resultsMatch(h_a_result_device, h_a_result_host, N);

    /***********/
    /* KERNEL1 */
    /***********/
    if (ok) {
        // d_a was overwritten by the previous kernel: restore the input.
        gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
        timerGPU.StartCounter();
        kernel1<<<iDivUp(N / 2, BLOCKSIZE), BLOCKSIZE>>>(d_a, d_b, d_c, N);
        gpuErrchk(cudaPeekAtLastError());
#ifdef DEBUG
        gpuErrchk(cudaDeviceSynchronize());
#endif
        // --- Remember: timing is in ms
        printf("Number of operations = %f; GFlops = %f\n", numOps, (1.e-6)*numOps / timerGPU.GetCounter());
        gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
        ok = resultsMatch(h_a_result_device, h_a_result_host, N);
    }

    /***********/
    /* KERNEL2 */
    /***********/
    if (ok) {
        // d_a was overwritten by the previous kernel: restore the input.
        gpuErrchk(cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice));
        timerGPU.StartCounter();
        kernel2<<<iDivUp(N / 4, BLOCKSIZE), BLOCKSIZE>>>(d_a, d_b, d_c, N);
        gpuErrchk(cudaPeekAtLastError());
#ifdef DEBUG
        gpuErrchk(cudaDeviceSynchronize());
#endif
        // --- Remember: timing is in ms
        printf("Number of operations = %f; GFlops = %f\n", numOps, (1.e-6)*numOps / timerGPU.GetCounter());
        gpuErrchk(cudaMemcpy(h_a_result_device, d_a, N*sizeof(float), cudaMemcpyDeviceToHost));
        ok = resultsMatch(h_a_result_device, h_a_result_host, N);
    }

    // --- Release device and host resources on both the success and the
    //     mismatch path.
    gpuErrchk(cudaFree(d_a));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_c));

    free(h_a);
    free(h_a_result_host);
    free(h_a_result_device);
    free(h_b);
    free(h_c);

    gpuErrchk(cudaDeviceReset());

    return ok ? 0 : 1;

}

CUDA中的每计数指令（IPC）和指令级并行（ILP）

Profiler测量

讨论

ILP测试代码

4名候选人的代码片段

1 个答案: