Automatic loop unrolling by the NVIDIA CUDA compiler

Date: 2013-07-03 11:12:05

Tags: cuda

I don't know whether NVCC is smart enough to automatically expose instruction-level parallelism (ILP) in a loop like this:

for (int i = 0; i < 8; i++) {
    if (somethingHappens) {
        someVar = someVar | 1 << i;
    }
}

Or should I rewrite it to expose the ILP explicitly:

char someVar[8];
for (int i = 0; i < 8; i++) {
    if (somethingHappens) {
        someVar[i] = 1 << i;
    }
}
// reduce someVar using __vaddus4 and 3 logical ANDs
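
For reference, a minimal sketch of the reduction hinted at in the comment above, assuming the 8 bytes are packed into two 32-bit words (reduceBytes is an illustrative name; __vaddus4 performs four independent byte-wise unsigned saturated additions):

__device__ unsigned char reduceBytes(const unsigned char someVar[8]) {
    // pack bytes 0..3 and bytes 4..7 into two 32-bit words
    unsigned int lo = someVar[0] | (someVar[1] << 8) | (someVar[2] << 16) | ((unsigned int)someVar[3] << 24);
    unsigned int hi = someVar[4] | (someVar[5] << 8) | (someVar[6] << 16) | ((unsigned int)someVar[7] << 24);
    unsigned int s  = __vaddus4(lo, hi);    // 4 saturated byte-wise adds in one instruction
    // collapse the four byte lanes with shifts and logical ANDs
    return (unsigned char)(((s & 0xff) + ((s >> 8) & 0xff) +
                            ((s >> 16) & 0xff) + (s >> 24)) & 0xff);
}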

Additional questions:

  • How deep are Kepler's arithmetic pipelines?
  • How can I efficiently measure whether this optimization is worthwhile? Is it sufficient to read the clock register before and after the block?
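
Regarding the timing question, a minimal sketch of reading the clock register around a block of code (timedKernel and d_cycles are illustrative names; clock64() is the 64-bit counterpart of clock() available on Fermi and later). Note that the compiler may move instructions across the reads unless the measured result is actually consumed:

__global__ void timedKernel(long long* d_cycles) {
    long long start = clock64();    // read the clock register before the block
    // ... block of code to measure ...
    long long stop = clock64();     // read the clock register after the block
    if (threadIdx.x == 0)
        d_cycles[blockIdx.x] = stop - start;    // store so the reads are not optimized away
}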

1 Answer:

Answer 0 (score: 5):

To answer your question, I considered four different kernels, in which each thread performs n_loop iterations of a for loop. The four kernels implement four different possible cases:

  1. the number of iterations n_loop is known at compile time;
  2. the number of iterations n_loop is known at compile time, and the summation is conditional;
  3. the number of iterations n_loop is known only at run time;
  4. the number of iterations n_loop is known only at run time, and a manual loop unroll by 4 is performed.

The full code is reported below:

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <time.h>
    
    #define BLOCKSIZE 512
    
    #define epsilon 0.5
    #define n_loop  8
    
    /**********/
    /* iDivUp */
    /**********/
    int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
    
    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess) 
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }
    
    /****************************************************/
    /* KERNEL #1: NUMBER OF LOOPS KNOWN AT COMPILE-TIME */
    /****************************************************/
    __global__ void testKernel1(float* input, float* output, int N) {
    
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
        if (tid < N) {
    
            float accum = 0.f;
    
            for (int i = 0; i < n_loop; i++) { 
                accum = accum + input[n_loop*tid+i];
            }
    
            output[tid] = accum;
    
        }
    
    }
    
    /****************************************************/
    /* KERNEL #2: NUMBER OF LOOPS KNOWN AT COMPILE-TIME */
    /****************************************************/
    __global__ void testKernel2(float* input, float* output, int N) {
    
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
        if (tid < N) {
    
            float accum = 0.f;
    
            for (int i = 0; i < n_loop; i++) if (input[n_loop*tid+i] < epsilon) accum = accum + input[n_loop*tid+i];
    
            output[tid] = accum;
    
        }
    
    }
    
    /************************************************/
    /* KERNEL #3: NUMBER OF LOOPS KNOWN AT RUN-TIME */
    /************************************************/
    __global__ void testKernel3(float* input, float* output, int N_loop, int N) {
    
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
        if (tid < N) {
    
            float accum = 0.f;
    
            for (int i = 0; i < N_loop; i++) accum = accum + input[N_loop*tid+i];
    
            output[tid] = accum;
    
        }
    
    }
    
    /*******************************************************************/
    /* KERNEL #4: NUMBER OF LOOPS KNOWN AT RUN-TIME - LOOP UNROLL OF 4 */
    /*******************************************************************/
    __global__ void testKernel4(float* input, float* output, int N_loop, int N) {
    
        int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
        if (tid < N) {
    
            float accum1 = 0.f;
            float accum2 = 0.f;
            float accum3 = 0.f;
            float accum4 = 0.f;
    
            for (int i = 0; i < N_loop/4; i++) {
                accum1 = accum1 + input[N_loop*tid+i];
                accum2 = accum2 + input[N_loop*tid+i+N_loop/4];
                accum3 = accum3 + input[N_loop*tid+i+2*N_loop/4];
                accum4 = accum4 + input[N_loop*tid+i+3*N_loop/4];
            }
    
            output[tid] = accum1 + accum2 + accum3 + accum4;
    
        }
    
    }
    
    int main() {
    
        const int N = 512*512*32;
    
        float* input    = (float*) malloc(n_loop*N*sizeof(float));
        float* output   = (float*) malloc(N*sizeof(float));
        float* output2  = (float*) malloc(N*sizeof(float));
        float* outputif = (float*) malloc(N*sizeof(float));
    
        float* d_input;     gpuErrchk(cudaMalloc((void**)&d_input, n_loop*N*sizeof(float)));
        float* d_output;    gpuErrchk(cudaMalloc((void**)&d_output, N*sizeof(float)));
    
        srand(time(NULL));
        for (int i=0; i<n_loop*N; i++) input[i] = rand() / (float)RAND_MAX; 
    
        gpuErrchk(cudaMemcpy(d_input, input, n_loop*N*sizeof(float), cudaMemcpyHostToDevice));
    
        // --- Host-side computations
        for (int k = 0; k < N; k++) {
            float accum1 = 0.f;
            float accum2 = 0.f;
            for (int i = 0; i < n_loop; i++) {
                accum1 = accum1 + input[n_loop*k+i];
                if (input[n_loop*k+i] < epsilon) accum2 = accum2 + input[n_loop*k+i];
            }
            output[k] = accum1;
            outputif[k] = accum2;
        }
    
        // --- Device-side computation - kernel1
        float time;
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
    
        testKernel1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Kernel1 elapsed time:  %3.4f ms \n", time);
    
        gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
    
        // --- Check CPU and GPU results
        for (int i=0; i<N; i++)
            if (output[i] != output2[i]) {
                printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
                return 1;
            }
        printf("kernel1: results match!\n");
    
        // --- Device-side computation - kernel2
        cudaEventRecord(start, 0);
    
        testKernel2<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, N);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Kernel2 elapsed time:  %3.4f ms \n", time);
    
        gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
    
        // --- Check CPU and GPU results
        for (int i=0; i<N; i++)
            if (outputif[i] != output2[i]) {
                printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, outputif[i], output2[i]);
                return 1;
            }
        printf("kernel2: results match!\n");
    
        // --- Device-side computation - kernel3
        cudaEventRecord(start, 0);
    
        testKernel3<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Kernel3 elapsed time:  %3.4f ms \n", time);
    
        gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
    
        // --- Check CPU and GPU results
        for (int i=0; i<N; i++)
            if (output[i] != output2[i]) {
                printf("Mismatch at i = %d, Host= %f, Device = %f\n", i, output[i], output2[i]);
                return 1;
            }
        printf("kernel3: results match!\n");
    
        // --- Device-side computation - kernel4
        cudaEventRecord(start, 0);
    
        testKernel4<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_input, d_output, n_loop, N);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Kernel4 elapsed time:  %3.4f ms \n", time);
    
        gpuErrchk(cudaMemcpy(output2, d_output, N*sizeof(float), cudaMemcpyDeviceToHost));
    
        // --- Check CPU and GPU results
        for (int i=0; i<N; i++)
            if (fabs(output[i] - output2[i]) > 0.0001) {
                printf("Mismatch at i = %d, Host= %f, Device = %f, difference = %f\n", i, output[i], output2[i], output2[i] - output[i]);
                return 1;
            }
        printf("kernel4: results match!\n");
    
        return 0;
    
    }
    

    Now let us analyze the disassembled code in the four different cases (compiled with CUDA 6.0). I consider compilation for the Fermi architecture.
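
    For reference, one possible way to obtain the disassembly with the CUDA binary utilities (the exact architecture flag depends on the target):

        nvcc -arch=sm_20 -o test test.cu    # compile for Fermi (sm_20)
        cuobjdump -sass test                # dump the SASS disassembly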

    KERNEL 1

         MOV R1, c[0x1][0x100];
         S2R R0, SR_CTAID.X;
         IMUL R2, R0, c[0x0][0x8];
         S2R R3, SR_TID.X;
         IADD R0, R2, R3;
         ISETP.GE.AND P0, PT, R0, c[0x0][0x28], PT;
     @P0 BRA.U 0xd8;
    @!P0 IADD R2, R3, R2;
    @!P0 ISCADD R2, R2, c[0x0][0x20], 0x5; 
    @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;
    @!P0 LD R9, [R2];
    @!P0 LD R8, [R2+0x4];
    @!P0 LD R7, [R2+0x8];
    @!P0 LD R6, [R2+0xc];
    @!P0 LD R5, [R2+0x10];
    @!P0 LD R4, [R2+0x14];
    @!P0 LD R3, [R2+0x18];
    @!P0 LD R2, [R2+0x1c];
    @!P0 F2F.F32.F32 R9, R9;
    @!P0 FADD R8, R9, R8;
    @!P0 FADD R7, R8, R7;
    @!P0 FADD R6, R7, R6;
    @!P0 FADD R5, R6, R5;
    @!P0 FADD R4, R5, R4;
    @!P0 FADD R3, R4, R3;
    @!P0 FADD R2, R3, R2;
    @!P0 ST [R0], R2;
         EXIT;
    

    In this case, the compiler fully unrolls the loop: you can see 8 distinct load (LD) instructions and 7 distinct floating-point add (FADD) instructions.
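
    As a side note, the amount of unrolling can also be steered explicitly with the unroll pragma; a sketch of kernel #1 with the unroll forced (testKernel1b is an illustrative name):

        __global__ void testKernel1b(float* input, float* output, int N) {
            int tid = threadIdx.x + blockIdx.x * blockDim.x;
            if (tid < N) {
                float accum = 0.f;
                #pragma unroll    // force full unroll; "#pragma unroll 1" would disable unrolling
                for (int i = 0; i < n_loop; i++)
                    accum = accum + input[n_loop*tid+i];
                output[tid] = accum;
            }
        }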

    KERNEL 2

        MOV R1, c[0x1][0x100];
        S2R R0, SR_CTAID.X;
        IMUL R0, R0, c[0x0][0x8];
        S2R R2, SR_TID.X;
        IADD R3, R0, R2;
        ISETP.GE.AND P0, PT, R3, c[0x0][0x28], PT;
    @P0 EXIT;
        IADD R0, R2, R0;
        ISCADD R9, R0, c[0x0][0x20], 0x5;
        LD R0, [R9];
        LD R2, [R9+0x4];
        LD R4, [R9+0x8];
        LD R5, [R9+0xc];
        LD R6, [R9+0x10];
        LD R7, [R9+0x14];
        LD R8, [R9+0x18];
        LD R9, [R9+0x1c];
        FSETP.LT.AND P0, PT, R0, 0.5, PT;
        FSETP.LT.AND P1, PT, R4, 0.5, PT;
        F2F.F32.F32 R0, R0;
        SEL R0, R0, RZ, P0;
        FSETP.LT.AND P0, PT, R2, 0.5, PT;
    @P0 FADD R0, R0, R2;
        FSETP.LT.AND P0, PT, R5, 0.5, PT;
    @P1 FADD R0, R0, R4;
    @P0 FADD R0, R0, R5;
        FSETP.LT.AND P1, PT, R8, 0.5, PT;
        FSETP.LT.AND P0, PT, R6, 0.5, PT;
        FADD R2, R0, R6;
        SEL R2, R2, R0, P0;
        FSETP.LT.AND P0, PT, R7, 0.5, PT;
        ISCADD R0, R3, c[0x0][0x24], 0x2;
    @P0 FADD R2, R2, R7;
        FSETP.LT.AND P0, PT, R9, 0.5, PT;
    @P1 FADD R2, R2, R8;
    @P0 FADD R2, R2, R9;
        ST [R0], R2;
        EXIT;
    

    Also in this case, the compiler fully unrolls the loop: again there are 8 distinct load (LD) instructions and 7 distinct floating-point add (FADD) instructions, with the conditionals implemented by predication (SEL and predicated FADD) rather than by branches.
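
    In C terms, each unrolled step behaves like the following branch-free sketch (my reading of the predication above):

        float v = input[n_loop*tid + i];
        accum = (v < epsilon) ? accum + v : accum;    // select, no divergent branch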

    KERNEL 3

    c[0x0][0x34]    = N
    c[0x1][0x100]   = BLOCKSIZE
    c[0x0][0x8]     = blockDim.x
    c[0x0][0x30]    = N_loop
    c[0x0][0x20]    = input
    
    /*0000*/         MOV R1, c[0x1][0x100];                           R1 = BLOCKSIZE = 512
    /*0008*/         S2R R0, SR_CTAID.X;                              R0 = blockIdx.x
    /*0010*/         S2R R2, SR_TID.X;                                R2 = threadIdx.x
    /*0018*/         IMAD R0, R0, c[0x0][0x8], R2;                    R0 = tid = blockIdx.x * blockDim.x + threadIdx.x
    /*0020*/         ISETP.GE.AND P0, PT, R0, c[0x0][0x34], PT;       P0 = (tid >= N) then EXIT
    /*0028*/     @P0 EXIT;
    /*0030*/         ISETP.LT.AND P0, PT, RZ, c[0x0][0x30], PT;       P0 = (0 < N_loop)
    /*0038*/     @P0 BRA 0x60;
    /*0040*/         MOV R4, RZ;
    /*0048*/         BRA 0x170;
    /*0050*/         NOP;
    /*0058*/         NOP;
    /*0060*/         MOV R2, c[0x0][0x30];                            R2 = N_loop
    /*0068*/         IMUL R3, R0, c[0x0][0x30];                       R3 = tid * N_loop
    /*0070*/         MOV32I R6, 0x4;                                  R6 = sizeof(float) = 4
    /*0078*/         ISETP.GT.AND P0, PT, R2, 0x3, PT;                P0 = (R2 > 3)
    /*0080*/         IMAD R2.CC, R3, R6, c[0x0][0x20];                R2 = R3 * R6 + input = tid * N_loop * 4 + input
    /*0088*/         MOV R4, RZ;                                      R4 = 0
    /*0090*/         MOV R5, RZ;                                      R5 = 0
    /*0098*/         IMAD.HI.X R3, R3, R6, c[0x0][0x24];              
    /*00a0*/    @!P0 BRA 0x128;                               
    /*00a8*/         MOV R6, c[0x0][0x30];                            R6 = N_loop
    /*00b0*/         IADD R10, R6, -0x3;                              R10 = N_loop - 3
    /*00b8*/         NOP;
    /*00c0*/         IADD R5, R5, 0x4;                                R5 = R5 + 4 = 4                              
    /*00c8*/         LD.E R6, [R2];                                   R6 = input[tid * N_loop]
    /*00d0*/         ISETP.LT.AND P0, PT, R5, R10, PT;                P0 = (4 < (N_loop - 3))
    /*00d8*/         LD.E R7, [R2+0x4];                               R7 = input[tid * N_loop + 1]
    /*00e0*/         LD.E R8, [R2+0x8];                               R8 = input[tid * N_loop + 2]
    /*00e8*/         LD.E R9, [R2+0xc];                               R9 = input[tid * N_loop + 3]
    /*00f0*/         IADD R2.CC, R2, 0x10;                            R2 = R2 + 16 = R2 + 4 * sizeof(float)
    /*00f8*/         IADD.X R3, R3, RZ;                               
    /*0100*/         FADD R6, R4, R6;                                 R6 = 0 + input[tid * N_loop]
    /*0108*/         FADD R4, R6, R7;                                 R4 = input[tid * N_loop] + input[tid * N_loop + 1]
    /*0110*/         FADD R8, R4, R8;                                 R8 = input[tid * N_loop] + input[tid * N_loop + 1] + input[tid * N_loop + 2]
    /*0118*/         FADD R4, R8, R9;                                 R4 = input[tid * N_loop] + input[tid * N_loop + 1] + input[tid * N_loop + 2] + input[tid * N_loop + 3]
    /*0120*/     @P0 BRA 0xc0;                                        ...
    /*0128*/         ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT;
    /*0130*/    @!P0 BRA 0x170;
    /*0138*/         IADD R5, R5, 0x1;
    /*0140*/         LD.E R6, [R2];
    /*0148*/         ISETP.LT.AND P0, PT, R5, c[0x0][0x30], PT;
    /*0150*/         IADD R2.CC, R2, 0x4;
    /*0158*/         IADD.X R3, R3, RZ;
    /*0160*/         FADD R4, R4, R6;
    /*0168*/     @P0 BRA 0x138;
    /*0170*/         MOV32I R3, 0x4;
    /*0178*/         IMAD R2.CC, R0, R3, c[0x0][0x28];
    /*0180*/         IMAD.HI.X R3, R0, R3, c[0x0][0x2c];
    /*0188*/         ST.E [R2], R4;
    /*0190*/         EXIT;
    

    As can be seen, the compiler automatically performs a loop unroll by 4: the main loop body contains 4 load (LD) operations and 4 floating-point adds (FADD), followed by a non-unrolled remainder loop for the leftover iterations.
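
    In C terms, the generated code behaves like the following sketch (my reading of the SASS above, with base = N_loop * tid):

        float accum = 0.f;
        int i = 0;
        for (; i < N_loop - 3; i += 4) {    // 4-way unrolled main loop
            accum = accum + input[base + i];
            accum = accum + input[base + i + 1];
            accum = accum + input[base + i + 2];
            accum = accum + input[base + i + 3];
        }
        for (; i < N_loop; i++)             // remainder loop
            accum = accum + input[base + i];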

    KERNEL 4

    /*0000*/        MOV R1, c[0x1][0x100];
    /*0008*/        S2R R0, SR_CTAID.X;
    /*0010*/        S2R R2, SR_TID.X;
    /*0018*/        IMAD R13, R0, c[0x0][0x8], R2;
    /*0020*/        ISETP.GE.AND P0, PT, R13, c[0x0][0x34], PT;
    /*0028*/    @P0 EXIT;
    /*0030*/        MOV R2, c[0x0][0x30];
    /*0038*/        SHR R0, R2, 0x1f;
    /*0040*/        ISETP.GT.AND P0, PT, R2, 0x3, PT;
    /*0048*/        IMAD.U32.U32.HI R0, R0, 0x4, R2;
    /*0050*/        SHR R0, R0, 0x2;
    /*0058*/    @P0 BRA 0x98;
    /*0060*/        MOV R18, RZ;
    /*0068*/        MOV R19, RZ;
    /*0070*/        MOV R10, RZ;
    /*0078*/        MOV R11, RZ;
    /*0080*/        BRA 0x308;
    /*0088*/        NOP;
    /*0090*/        NOP;
    /*0098*/        MOV R3, c[0x0][0x30];
    /*00a0*/        IMUL R4, R13, c[0x0][0x30];
    /*00a8*/        MOV32I R5, 0x4;
    /*00b0*/        IMUL R2, R3, 0x3;
    /*00b8*/        SHL R6, R3, 0x1;
    /*00c0*/        IADD R10, R0, R4;
    /*00c8*/        SHR R3, R2, 0x1f;
    /*00d0*/        IMAD R8.CC, R4, R5, c[0x0][0x20];
    /*00d8*/        SHR R7, R6, 0x1f;
    /*00e0*/        IMAD.U32.U32.HI R2, R3, 0x4, R2;
    /*00e8*/        IMAD.HI.X R9, R4, R5, c[0x0][0x24];
    /*00f0*/        IMAD.U32.U32.HI R7, R7, 0x4, R6;
    /*00f8*/        IMAD.HI R3, R2, c[0x10][0x0], R4;
    /*0100*/        IMAD R6.CC, R10, R5, c[0x0][0x20];
    /*0108*/        ISETP.GT.AND P0, PT, R0, 0x1, PT;
    /*0110*/        IMAD.HI R14, R7, c[0x10][0x0], R4;
    /*0118*/        MOV R18, RZ;
    /*0120*/        IMAD.HI.X R7, R10, R5, c[0x0][0x24];
    /*0128*/        MOV R19, RZ;
    /*0130*/        IMAD R2.CC, R3, R5, c[0x0][0x20];
    /*0138*/        MOV R10, RZ;
    /*0140*/        IMAD.HI.X R3, R3, R5, c[0x0][0x24];
    /*0148*/        MOV R11, RZ;
    /*0150*/        IMAD R4.CC, R14, R5, c[0x0][0x20];
    /*0158*/        MOV R12, RZ;
    /*0160*/        IMAD.HI.X R5, R14, R5, c[0x0][0x24];
    /*0168*/   @!P0 BRA 0x260;
    /*0170*/        IADD R16, R0, -0x1;
    /*0178*/        NOP;
    /*0180*/        IADD R12, R12, 0x2;
    /*0188*/        LD.E R15, [R8];
    /*0190*/        ISETP.LT.AND P0, PT, R12, R16, PT;
    /*0198*/        LD.E R20, [R6];
    /*01a0*/        FADD R17, R18, R15;
    /*01a8*/        LD.E R14, [R4];
    /*01b0*/        FADD R19, R19, R20;
    /*01b8*/        LD.E R15, [R2];
    /*01c0*/        LD.E R18, [R8+0x4];
    /*01c8*/        LD.E R20, [R6+0x4];
    /*01d0*/        IADD R6.CC, R6, 0x8;
    /*01d8*/        NOP;
    /*01e0*/        FADD R14, R10, R14;
    /*01e8*/        FADD R15, R11, R15;
    /*01f0*/        IADD.X R7, R7, RZ;
    /*01f8*/        LD.E R10, [R4+0x4];
    /*0200*/        IADD R4.CC, R4, 0x8;
    /*0208*/        LD.E R11, [R2+0x4];
    /*0210*/        IADD.X R5, R5, RZ;
    /*0218*/        FADD R18, R17, R18;
    /*0220*/        IADD R2.CC, R2, 0x8;
    /*0228*/        FADD R19, R19, R20;
    /*0230*/        IADD.X R3, R3, RZ;
    /*0238*/        IADD R8.CC, R8, 0x8;
    /*0240*/        IADD.X R9, R9, RZ;
    /*0248*/        FADD R10, R14, R10;
    /*0250*/        FADD R11, R15, R11;
    /*0258*/    @P0 BRA 0x180;
    /*0260*/        ISETP.LT.AND P0, PT, R12, R0, PT;
    /*0268*/   @!P0 BRA 0x308;
    /*0270*/        IADD R12, R12, 0x1;
    /*0278*/        LD.E R17, [R8];
    /*0280*/        ISETP.LT.AND P0, PT, R12, R0, PT;
    /*0288*/        LD.E R16, [R6];
    /*0290*/        IADD R6.CC, R6, 0x4;
    /*0298*/        LD.E R15, [R4];
    /*02a0*/        IADD.X R7, R7, RZ;
    /*02a8*/        LD.E R14, [R2];
    /*02b0*/        IADD R4.CC, R4, 0x4;
    /*02b8*/        IADD.X R5, R5, RZ;
    /*02c0*/        IADD R2.CC, R2, 0x4;
    /*02c8*/        IADD.X R3, R3, RZ;
    /*02d0*/        IADD R8.CC, R8, 0x4;
    /*02d8*/        IADD.X R9, R9, RZ;
    /*02e0*/        FADD R18, R18, R17;
    /*02e8*/        FADD R19, R19, R16;
    /*02f0*/        FADD R10, R10, R15;
    /*02f8*/        FADD R11, R11, R14;
    /*0300*/    @P0 BRA 0x270;
    /*0308*/        FADD R0, R18, R19;
    /*0310*/        MOV32I R3, 0x4;
    /*0318*/        FADD R0, R0, R10;
    /*0320*/        IMAD R2.CC, R13, R3, c[0x0][0x28];
    /*0328*/        FADD R0, R0, R11;
    /*0330*/        IMAD.HI.X R3, R13, R3, c[0x0][0x2c];
    /*0338*/        ST.E [R2], R0;
    /*0340*/        EXIT;


    In this case, the compiler automatically performs a loop unroll by 2, which stacks on top of the manual unroll by 4: the main loop body contains 8 load (LD) operations and 8 floating-point adds (FADD).

    The compiler behaves similarly for the Kepler architecture, although the disassembled code differs from that for Fermi.

    Thanks to the automatic loop unrolling, the performance of the different kernels differs very little.

    I do not explicitly report results for the Fermi architecture, but the timings were approximately the same for the four considered kernels.