编辑: njuffa是对的,这个版本是用-G编译的,它禁用了所有优化。随着负载和存储的向量化,新的SASS速度更快。
基于经典的例子,我在cuda中修改了两个版本的矢量加法。问题是,float4版本是float版本的两倍,数据大小减少了4倍。对两个内核进行分析清楚地表明,float4版本平均每次执行4次加载和4次存储,而float版本只对两者执行。这听起来像是一个关于未定位访问float4的noob问题,BTW由下面的PTX确认,但我找不到。
我正在使用Cuda 7.0 rc和quadro K4000。
关于在哪里看的任何想法?
编译选项?
__aligned__ keyword ?
__global__ void add_float(float *c, const float *a, const float *b)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
c[i] = a[i] + b[i];
}
__global__ void add_float4(float4 *c, const float4 *a, const float4 *b) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
float4 a1 = a[i];
float4 b1 = b[i];
float4 c1;
c1.x = a1.x + b1.x;
c1.y = a1.y + b1.y;
c1.z = a1.z + b1.z;
c1.w = a1.w + b1.w;
c[i] = c1;
}
PTX就行:
float4 a1 = a[i];
表示:
...
ld.f32 %f1, [%rd6];
ld.f32 %f2, [%rd6+4];
ld.f32 %f3, [%rd6+8];
ld.f32 %f4, [%rd6+12];
st.f32 [%SP+12], %f4;
st.f32 [%SP+8], %f3;
st.f32 [%SP+4], %f2;
st.f32 [%SP+0], %f1;
...
SASS objdump说:
/*0108*/ MOV R10, R0; /* 0x2800000000029de4 */
/*0110*/ ISET.LT.AND R11, R0, RZ, PT; /* 0x108e0000fc02dc23 */
/*0118*/ MOV32I R13, 0x4; /* 0x1800000010035de2 */
/*0120*/ ISETP.LE.U32.AND P0, PT, R13, 0x20, PT; /* 0x198ec00080d1dc03 */
/*0128*/ ISUB R12, 0x20, R13; /* 0x4800c00080d31e03 */
/*0130*/ SHL R11, R11, R13; /* 0x6000000034b2dc03 */
/*0138*/ SHR.U32 R14, R10, R12; /* 0x5800000030a39c03 */
/* 0x22c2804282328047 */
/*0148*/ IADD R11, R11, R14; /* 0x4800000038b2dc03 */
/*0150*/ @!P0 IADD R12, R13, -0x20; /* 0x4800ffff80d32003 */
/*0158*/ @!P0 SHL R11, R10, R12; /* 0x6000000030a2e003 */
/*0160*/ SHL R10, R10, R13; /* 0x6000000034a29c03 */
/*0168*/ MOV R10, R10; /* 0x2800000028029de4 */
/*0170*/ MOV R11, R11; /* 0x280000002c02dde4 */
/*0178*/ IADD R8.CC, R8, R10; /* 0x4801000028821c03 */
/* 0x228042c042828047 */
/*0188*/ IADD.X R9, R9, R11; /* 0x480000002c925c43 */
/*0190*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0198*/ MOV R9, R9; /* 0x2800000024025de4 */
/*01a0*/ LD.E R10, [R8]; /* 0x8400000000829c85 */
/*01a8*/ IADD R12.CC, R8, 0x4; /* 0x4801c00010831c03 */
/*01b0*/ IADD.X R13, R9, RZ; /* 0x48000000fc935c43 */
/*01b8*/ MOV R12, R12; /* 0x2800000030031de4 */
/* 0x2202828042c2e287 */
/*01c8*/ MOV R13, R13; /* 0x2800000034035de4 */
/*01d0*/ LD.E R11, [R12]; /* 0x8400000000c2dc85 */
/*01d8*/ IADD R12.CC, R8, 0x8; /* 0x4801c00020831c03 */
/*01e0*/ IADD.X R13, R9, RZ; /* 0x48000000fc935c43 */
/*01e8*/ MOV R12, R12; /* 0x2800000030031de4 */
/*01f0*/ MOV R13, R13; /* 0x2800000034035de4 */
/*01f8*/ LD.E R12, [R12]; /* 0x8400000000c31c85 */
/* 0x2282c202828042c7 */
/*0208*/ IADD R8.CC, R8, 0xc; /* 0x4801c00030821c03 */
/*0210*/ IADD.X R9, R9, RZ; /* 0x48000000fc925c43 */
/*0218*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0220*/ MOV R9, R9; /* 0x2800000024025de4 */
/*0228*/ LD.E R8, [R8]; /* 0x8400000000821c85 */
/*0230*/ IADD R14.CC, R2, 0xc; /* 0x4801c00030239c03 */
/*0238*/ IADD.X R15, R3, RZ; /* 0x48000000fc33dc43 */
/* 0x22828042c2e28047 */
/*0248*/ MOV R14, R14; /* 0x2800000038039de4 */
/*0250*/ MOV R15, R15; /* 0x280000003c03dde4 */
/*0258*/ ST.E [R14], R8; /* 0x9400000000e21c85 */
/*0260*/ IADD R8.CC, R2, 0x8; /* 0x4801c00020221c03 */
/*0268*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*0270*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0278*/ MOV R9, R9; /* 0x2800000024025de4 */
/* 0x22c2e2828042c2e7 */
/*0288*/ ST.E [R8], R12; /* 0x9400000000831c85 */
/*0290*/ IADD R8.CC, R2, 0x4; /* 0x4801c00010221c03 */
/*0298*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*02a0*/ MOV R8, R8; /* 0x2800000020021de4 */
/*02a8*/ MOV R9, R9; /* 0x2800000024025de4 */
/*02b0*/ ST.E [R8], R11; /* 0x940000000082dc85 */
/*02b8*/ IADD R8.CC, R2, RZ; /* 0x48010000fc221c03 */
/* 0x22820042e2828047 */
/*02c8*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*02d0*/ MOV R8, R8; /* 0x2800000020021de4 */
/*02d8*/ MOV R9, R9; /* 0x2800000024025de4 */
/*02e0*/ ST.E [R8], R10; /* 0x9400000000829c85 */
以下是其余部分:
void CudaTest()
{
int size = 8192;
float *dev_a = 0;
float *dev_b = 0;
float *dev_c = 0;
float *host_a = (float*)malloc(4 * size * sizeof(float));
float *host_b = (float*)malloc(4 * size * sizeof(float));
float *host_c = (float*)malloc(4 * size * sizeof(float));
float4 *dev_a4 = 0;
float4 *dev_b4 = 0;
float4 *dev_c4 = 0;
float4 *host_a4 = (float4*)malloc(size * sizeof(float4));
float4 *host_b4 = (float4*)malloc(size * sizeof(float4));
float4 *host_c4 = (float4*)malloc(size * sizeof(float4));
for (int i = 0; i < 4 * size; i++)
{
host_a[i] = rand() / RAND_MAX;
host_b[i] = rand() / RAND_MAX;
}
for (int i = 0; i < size; i++)
{
host_a4[i].x = rand() / RAND_MAX;
host_a4[i].y = rand() / RAND_MAX;
host_a4[i].z = rand() / RAND_MAX;
host_a4[i].w = rand() / RAND_MAX;
host_b4[i].x = rand() / RAND_MAX;
host_b4[i].y = rand() / RAND_MAX;
host_b4[i].z = rand() / RAND_MAX;
host_b4[i].w = rand() / RAND_MAX;
}
// Choose which GPU to run on, change this on a multi-GPU system.
CUDA_CALL(cudaSetDevice(0));
// Allocate GPU buffers for three vectors (two input, one output) .
CUDA_CALL(cudaMalloc((void**)&dev_c, 4 * size * sizeof(float)));
CUDA_CALL(cudaMalloc((void**)&dev_a, 4 * size * sizeof(float)));
CUDA_CALL(cudaMalloc((void**)&dev_b, 4 * size * sizeof(float)));
CUDA_CALL(cudaMalloc((void**)&dev_c4, size * sizeof(float4)));
CUDA_CALL(cudaMalloc((void**)&dev_a4, size * sizeof(float4)));
CUDA_CALL(cudaMalloc((void**)&dev_b4, size * sizeof(float4)));
// Copy input vectors from host memory to GPU buffers.
CUDA_CALL(cudaMemcpy(dev_a, host_a, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(dev_b, host_b, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(dev_a4, host_a4, size * sizeof(float4), cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(dev_b4, host_b4, size * sizeof(float4), cudaMemcpyHostToDevice));
int local = 256;
int N = size / local;
// Launch a kernel on the GPU with one thread for each element.
add_float << <4*N, local >> >(dev_c, dev_a, dev_b);
// Check for any errors launching the kernel
CUDA_CALL(cudaGetLastError());
add_float4 << <N, local >> >(dev_c4, dev_a4, dev_b4);
// Check for any errors launching the kernel
CUDA_CALL(cudaGetLastError());
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
CUDA_CALL(cudaDeviceSynchronize());
// Copy output vector from GPU buffer to host memory.
CUDA_CALL(cudaMemcpy(host_c, dev_c, 4 * size * sizeof(float), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(host_c4, dev_c4, size * sizeof(float4), cudaMemcpyDeviceToHost));
}
答案 0 :(得分:1)
GPU硬件提供的向量加载/存储指令的使用被认为是编译器应用的性能优化,因为代码使用标量加载和存储完全起作用。当代码由nvcc
和-G
编译时(通常用于调试),所有优化(包括加载和存储的向量化)都将被关闭。
要检查加载/存储矢量化,重要的是查看正在执行的实际机器代码(SASS),而不是PTX,它只是一个中间代码,由一个被调用的优化编译器组件编译成SASS ptxas
由驱动程序nvcc
调用。对cuobjdump --dump-sass
生成的可执行文件运行nvcc
以检查机器代码。