Question

我正在尝试了解我正在处理的大型CUDA内核的指令吞吐量。我写了两个小程序来比较加法和移位指令的吞吐量。根据CUDA C编程指南，移位指令的吞吐量是添加指令的一半。但是，当我测量在特斯拉M2070上跟随两个程序的时间时，时间完全相同。有人可以解释为什么会这样吗？

加法计划：

#include <limits.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <cstdlib>
#include <stdint.h>

using namespace std;

__global__ void testAdd(int numIterations, uint1* result){
  int total = 1;
  for(int i=0; i< numIterations;i ++){
    total = total+i;
  }
  result[0] = make_uint1(total);
}

int main(){
  uint1* result;
  cudaMalloc((void**)(&(result)), sizeof(uint1));
  float totalElapsedTime = 0;
  int i;

  for(i = 0; i < 10; i++){
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    testAdd<<<1,1>>>(100000, result);
    cudaError_t e50 = cudaGetLastError();
    if(e50 == cudaSuccess){
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsedTime;
      cudaEventElapsedTime(&elapsedTime, start, stop);
      totalElapsedTime += elapsedTime;;
      //cout << "Elapsed Time:" << elapsedTime << endl;                                                                                                                              

    }else{
      cout << "Error launching kernel: " << e50 << endl;
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
  }
  cout << "Elapsed Time: " << totalElapsedTime/i << endl;
  cudaFree(result);
}

轮班计划：

#include <limits.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <cstdlib>
#include <stdint.h>

using namespace std;

__global__ void testShift(int numIterations, uint1* result){
  int total = 1;
  for(int i=0; i< numIterations;i ++){
    total = total<<i;
  }
  result[0] = make_uint1(total);
}

int main(){
  uint1* result;
  cudaMalloc((void**)(&(result)), sizeof(uint1));
  float totalElapsedTime = 0;
  int i;

  for(i = 0; i < 10; i++){
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    testShift<<<1,1>>>(100000, result);
    cudaError_t e50 = cudaGetLastError();
    if(e50 == cudaSuccess){
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsedTime;
      cudaEventElapsedTime(&elapsedTime, start, stop);
      totalElapsedTime += elapsedTime;;
      //cout << "Elapsed Time:" << elapsedTime << endl;                                                                                                                              

    }else{
      cout << "Error launching kernel: " << e50 << endl;
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
  }
  cout << "Elapsed Time: " << totalElapsedTime/i << endl;
  cudaFree(result);
}

编辑：添加ptx代码添加和转移程序。正如您所看到的那样，唯一的区别在于第78行，即添加指令与shl指令。

添加PTX代码：

        .entry _Z7testAddiP5uint1 (
                .param .s32 __cudaparm__Z7testAddiP5uint1_numIterations,
                .param .u64 __cudaparm__Z7testAddiP5uint1_result)
        {
        .reg .u32 %r<8>;
        .reg .u64 %rd<3>;
        .reg .pred %p<4>;
        .loc    16      10      0
 //   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testAdd(int numIterations, uint1* result){
$LDWbegin__Z7testAddiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
        mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total+i;
        add.s32         %r5, %r4, %r5;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        .loc    16      13      0
        setp.ne.s32     %p2, %r1, %r4;
//   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testAdd(int numIterations, uint1* result){
$LDWbegin__Z7testAddiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
        mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total+i;
        add.s32         %r5, %r4, %r5;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        .loc    16      13      0
        setp.ne.s32     %p2, %r1, %r4;
        @%p2 bra        $Lt_0_1794;
        bra.uni         $Lt_0_1282;
$Lt_0_2306:
        mov.s32         %r5, 1;
$Lt_0_1282:
        .loc    16      15      0
 //  14    }
 //  15    result[0] = make_uint1(total);
        ld.param.u64    %rd1, [__cudaparm__Z7testAddiP5uint1_result];
        st.global.u32   [%rd1+0], %r5;
        .loc    16      16      0
 //  16  }
        exit;
$LDWend__Z7testAddiP5uint1:
        } // _Z7testAddiP5uint1

转换PTX代码：

        .entry _Z9testShiftiP5uint1 (
                .param .s32 __cudaparm__Z9testShiftiP5uint1_numIterations,
                .param .u64 __cudaparm__Z9testShiftiP5uint1_result)
        {
        .reg .u32 %r<8>;
        .reg .u64 %rd<3>;
        .reg .pred %p<4>;
        .loc    16      10      0
 //   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testShift(int numIterations, uint1* result){
$LDWbegin__Z9testShiftiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
    mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total<<i;
        shl.b32         %r5, %r5, %r4;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        .reg .u64 %rd<3>;
        .reg .pred %p<4>;
        .loc    16      10      0
 //   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testShift(int numIterations, uint1* result){
$LDWbegin__Z9testShiftiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
    mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total<<i;
        shl.b32         %r5, %r5, %r4;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        .loc    16      13      0
        setp.ne.s32     %p2, %r1, %r4;
        @%p2 bra        $Lt_0_1794;
        bra.uni         $Lt_0_1282;
$Lt_0_2306:
        mov.s32         %r5, 1;
$Lt_0_1282:
        .loc    16      15      0
 //  14    }
 //  15    result[0] = make_uint1(total);
        ld.param.u64    %rd1, [__cudaparm__Z9testShiftiP5uint1_result];
        st.global.u32   [%rd1+0], %r5;
        .loc    16      16      0
 //  16  }
        exit;
$LDWend__Z9testShiftiP5uint1:
        } // _Z9testShiftiP5uint1

Answer 1

@gmemon：如果你想检查GPU汇编代码，那么PTX在这里没什么用，因为它是中间语言。

要获取实际的汇编代码，您可以执行以下操作：

使用NVCC -keep选项
使用cuobjdump - 在CUBIN文件上使用-dump-sass进行反汇编

CUBIN文件通常称为foo.sm_20.cubin或foo.sm_30.cubin，具体取决于您的架构。

例如，kepler反汇编看起来如下：

    /*7458*/     /*0x001b9e85c0000000*/     LDL.CS R46, [R1];
    /*7460*/     /*0x101ade85c0000000*/     LDL.CS R43, [R1+0x4];
    /*7468*/     /*0xf2655c85c8000063*/     STL [R38+0x18fc], R21;
    /*7470*/     /*0x3ee35c036800c000*/     LOP.AND R13, R46, 0xf;
    /*7478*/     /*0x400000076000000c*/     SSY 0x7790;
    /*7488*/     /*0xfcdfdd0348010000*/     IADD RZ.CC, R13, -RZ;
    /*7490*/     /*0xfff1dc63190e0000*/     ISETP.EQ.X.AND P0, pt, RZ, RZ, pt;
    /*7498*/     /*0x800001e74000000b*/     @P0 BRA 0x7780;
    /*74a0*/     /*0xfc001de428000000*/     MOV R0, RZ;
    /*74a8*/     /*0x04039de218000000*/     MOV32I R14, 0x1;
    /*74b0*/     /*0x0403dde218000000*/     MOV32I R15, 0x1;
    /*74b8*/     /*0x626fdca5c8000064*/     STL.64 [R38+0x1918], RZ;

指令语义可以在cuobjdump工具手册中找到

Answer 2

我建议查看PTX代码中的说明数量 - 您可以发布两个示例的PTX代码吗？这应该提供关于性能的线索。

不过，我不确定你是否能用一个线程可靠地测试性能。

CUDA添加与移位指令性能

2 个答案: