我编译了流程简单的测试内核(CUDA5,sm2.0):
__device__ void TestKernel(int *pdata)
{
int a0,b0,c0;
a0 = pdata[0];
b0 = pdata[1];
c0 = a0 + b0;
pdata[2] = c0;
}
并期待像流水汇编程序这样的东西:
LD R3,[R0]
LD R4,[R0+4]
IADD R4,R4,R3
ST [R0+8],R4
但是从cuobjdump --dump-sass我看到了更长的结果:
/*0000*/ /*0x10001de428000000*/ MOV R0, R4;
/*0008*/ /*0x00001de428000000*/ MOV R0, R0;
/*0010*/ /*0x00001de428000000*/ MOV R0, R0;
/*0018*/ /*0x00001de428000000*/ MOV R0, R0;
/*0020*/ /*0x0000dc8580000000*/ LD R3, [R0];
/*0028*/ /*0x0c00dde428000000*/ MOV R3, R3;
/*0030*/ /*0x10011c034800c000*/ IADD R4, R0, 0x4;
/*0038*/ /*0x10011de428000000*/ MOV R4, R4;
/*0040*/ /*0x00411c8580000000*/ LD R4, [R4];
/*0048*/ /*0x10011de428000000*/ MOV R4, R4;
/*0050*/ /*0x1030dc0348000000*/ IADD R3, R3, R4;
/*0058*/ /*0x20001c034800c000*/ IADD R0, R0, 0x8;
/*0060*/ /*0x00001de428000000*/ MOV R0, R0;
/*0068*/ /*0x0000dc8590000000*/ ST [R0], R3;
/*0070*/ /*0x00001de790000000*/ RET;
/*0078*/ /*0x00001de780000000*/ EXIT;
/*0080*/ /*0x00001de780000000*/ EXIT;
对我来说很奇怪MOVs在地址8,10,18,28,38,60中的指令 还没有使用加载/存储指令中的立即偏移量。 所以相反,预期4(实际上是6包括RET,EXIT)指令我得到15 可能的原因是什么?
答案 0 :(得分:3)
您所看到的几乎可以肯定是因为您正在编译打开调试。如果我构建你的内核,我会得到这个:
$ nvcc -arch=sm_30 -c asmprob.cu
$ cuobjdump -sass asmprob.o
Fatbin elf code:
================
arch = sm_30
code version = [1,6]
producer = cuda
host = mac
compile_size = 32bit
identifier = asmprob.cu
code for sm_30
Function : _Z10TestKernelPi
/*0008*/ /*0x10005de428004001*/ MOV R1, c [0x0] [0x44];
/*0010*/ /*0x00009de428004005*/ MOV R2, c [0x0] [0x140];
/*0018*/ /*0x10211c034800c000*/ IADD R4, R2, 0x4;
/*0020*/ /*0x20209c034800c000*/ IADD R2, R2, 0x8;
/*0028*/ /*0x0040dc8580000000*/ LD R3, [R4];
/*0030*/ /*0xf0401c8583ffffff*/ LD R0, [R4+-0x4];
/*0038*/ /*0x00301c0348000000*/ IADD R0, R3, R0;
/*0048*/ /*0x00201c8590000000*/ ST [R2], R0;
/*0050*/ /*0x00001de780000000*/ EXIT;
/*0058*/ /*0xe0001de74003ffff*/ BRA 0x58;
/*0060*/ /*0x00001de440000000*/ NOP CC.T;
/*0068*/ /*0x00001de440000000*/ NOP CC.T;
/*0070*/ /*0x00001de440000000*/ NOP CC.T;
/*0078*/ /*0x00001de440000000*/ NOP CC.T;
.................................
另一方面,如果我使用调试设置构建它,我会像你显示的那样得到代码:
$ nvcc -arch=sm_30 -G -c asmprob.cu
$ cuobjdump -sass asmprob.o
Fatbin elf code:
================
arch = sm_30
code version = [1,6]
producer = cuda
host = mac
compile_size = 32bit
has debug info
compressed
identifier = asmprob.cu
code for sm_30
Function : _Z10TestKernelPi
/*0000*/ /*0x10005de428004001*/ MOV R1, c [0x0] [0x44];
/*0008*/ /*0x00001de218000005*/ MOV32I R0, 0x140;
/*0010*/ /*0x00001c8614000000*/ LDC R0, c [0x0] [R0];
/*0018*/ /*0x00001de428000000*/ MOV R0, R0;
/*0020*/ /*0x00009c8580000000*/ LD R2, [R0];
/*0028*/ /*0x08009de428000000*/ MOV R2, R2;
/*0030*/ /*0x1000dc034800c000*/ IADD R3, R0, 0x4;
/*0038*/ /*0x0c00dde428000000*/ MOV R3, R3;
/*0040*/ /*0x0030dc8580000000*/ LD R3, [R3];
/*0048*/ /*0x0c00dde428000000*/ MOV R3, R3;
/*0050*/ /*0x0c209c0348000000*/ IADD R2, R2, R3;
/*0058*/ /*0x20001c034800c000*/ IADD R0, R0, 0x8;
/*0060*/ /*0x00001de428000000*/ MOV R0, R0;
/*0068*/ /*0x00009c8590000000*/ ST [R0], R2;
/*0070*/ /*0x40001de740000000*/ BRA 0x88;
/*0078*/ /*0x00001de780000000*/ EXIT;
/*0080*/ /*0x00001de780000000*/ EXIT;
/*0088*/ /*0x00001de780000000*/ EXIT;
/*0090*/ /*0x00001de780000000*/ EXIT;
/*0098*/ /*0xe0001de74003ffff*/ BRA 0x98;
/*00a0*/ /*0x00001de440000000*/ NOP CC.T;
/*00a8*/ /*0x00001de440000000*/ NOP CC.T;
/*00b0*/ /*0x00001de440000000*/ NOP CC.T;
/*00b8*/ /*0x00001de440000000*/ NOP CC.T;
.................................
这让我觉得你的问题是“当我禁用优化并为调试器编译时,为什么编译器不会生成最佳代码?”这是一个修辞问题,这是一个问题....
编辑:
并且有任何疑问,启用GPU调试会禁用编译器优化,请考虑'nvcc'的以下输出:
$ nvcc -arch=sm_30 -G -c --dryrun asmprob.cu
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/usr/local/cuda/bin
#$ _THERE_=/usr/local/cuda/bin
#$ _TARGET_SIZE_=
#$ TOP=/usr/local/cuda/bin/..
#$ PATH=/usr/local/cuda/bin/../open64/bin:/usr/local/cuda/bin/../nvvm:/usr/local/cuda/bin:/opt/local/bin:/opt/local/sbin:/Library/Frameworks/Python.framework/Versions/Current/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin:/usr/local/git/bin:/usr/texbin:/usr/X11/bin:/usr/NX/bin:/usr/local/bin:/Users/talonmies/bin:/usr/local/cuda/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../lib" -lcudart
#$ CUDAFE_FLAGS=
#$ OPENCC_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=300 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ "-I/usr/local/cuda/bin/../include" -include "cuda_runtime.h" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-6_asmprob.cpp1.ii" "asmprob.cu"
#$ cudafe --m32 --gnu_version=40201 -tused --no_remove_unneeded_entities --debug_mode --gen_c_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.c" --stub_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.gpu" --nv_arch "compute_30" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00005ceb_00000000-2_asmprob.module_id" --include_file_name "tmpxft_00005ceb_00000000-1_asmprob.fatbin.c" "/tmp/tmpxft_00005ceb_00000000-6_asmprob.cpp1.ii"
#$ gcc -D__CUDA_ARCH__=300 -E -x c -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDANVVM__ -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/usr/local/cuda/bin/../include" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-7_asmprob.cpp2.i" "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.gpu"
#$ cudafe -w --m32 --gnu_version=40201 --c --debug_mode --gen_c_file_name "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.c" --stub_file_name "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.stub.c" --gen_device_file_name "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.gpu" --nv_arch "compute_30" --module_id_file_name "/tmp/tmpxft_00005ceb_00000000-2_asmprob.module_id" --include_file_name "tmpxft_00005ceb_00000000-1_asmprob.fatbin.c" "/tmp/tmpxft_00005ceb_00000000-7_asmprob.cpp2.i"
#$ gcc -D__CUDA_ARCH__=300 -E -x c -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDABE__ -D__CUDANVVM__ -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/usr/local/cuda/bin/../include" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-9_asmprob.cpp3.i" "/tmp/tmpxft_00005ceb_00000000-8_asmprob.cudafe2.gpu"
#$ filehash -s " -g --dont-merge-basicblocks --return-at-end " "/tmp/tmpxft_00005ceb_00000000-9_asmprob.cpp3.i" > "/tmp/tmpxft_00005ceb_00000000-10_asmprob.hash"
#$ gcc -E -x c++ -D__CUDACC__ -D__NVCC__ "-I/usr/local/cuda/bin/../include" -include "cuda_runtime.h" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-4_asmprob.cpp4.ii" "asmprob.cu"
#$ cudafe++ --m32 --gnu_version=40201 --parse_templates --debug_mode --gen_c_file_name "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.cpp" --stub_file_name "tmpxft_00005ceb_00000000-3_asmprob.cudafe1.stub.c" --module_id_file_name "/tmp/tmpxft_00005ceb_00000000-2_asmprob.module_id" "/tmp/tmpxft_00005ceb_00000000-4_asmprob.cpp4.ii"
#$ cicc -arch compute_30 -m32 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -g -O0 "/tmp/tmpxft_00005ceb_00000000-11_asmprob" "/tmp/tmpxft_00005ceb_00000000-9_asmprob.cpp3.i" -o "/tmp/tmpxft_00005ceb_00000000-5_asmprob.ptx"
#$ ptxas -arch=sm_30 -m32 -g --dont-merge-basicblocks --return-at-end "/tmp/tmpxft_00005ceb_00000000-5_asmprob.ptx" -o "/tmp/tmpxft_00005ceb_00000000-12_asmprob.sm_30.cubin"
#$ fatbinary --create="/tmp/tmpxft_00005ceb_00000000-1_asmprob.fatbin" -32 --key="xxxxxxxxxx" --ident="asmprob.cu" --cmdline=" -g --dont-merge-basicblocks --return-at-end " -g "--image=profile=sm_30,file=/tmp/tmpxft_00005ceb_00000000-12_asmprob.sm_30.cubin" "--image=profile=compute_30,file=/tmp/tmpxft_00005ceb_00000000-5_asmprob.ptx" --embedded-fatbin="/tmp/tmpxft_00005ceb_00000000-1_asmprob.fatbin.c" --cuda
#$ rm /tmp/tmpxft_00005ceb_00000000-1_asmprob.fatbin
#$ gcc -D__CUDA_ARCH__=300 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDA_PREC_DIV -D__CUDA_PREC_SQRT "-I/usr/local/cuda/bin/../include" -m32 -malign-double -o "/tmp/tmpxft_00005ceb_00000000-13_asmprob.ii" "/tmp/tmpxft_00005ceb_00000000-3_asmprob.cudafe1.cpp"
#$ gcc -c -x c++ "-I/usr/local/cuda/bin/../include" -fpreprocessed -m32 -malign-double -o "asmprob.o" "/tmp/tmpxft_00005ceb_00000000-13_asmprob.ii"
请注意设备代码编译阶段命令:
cicc -arch compute_30 -m32 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -g -O0 <----
即。调试构建编译时优化设置为0。