EDIT: Achievements over time (~1 TFlops/s) are listed at the end of this question.

I am writing a math library of sorts for C#, using OpenCL (GPU) from a C++ DLL, and I have done some optimization of single-precision square matrix-matrix multiplication (for learning purposes, and for possible reuse later in a neural-network program). The kernel code below takes the 1D array v1 as the rows of matrix-1 (1024x1024) and the 1D array v2 as the rows of matrix-2 (1024x1024, stored transposed so its columns can be read efficiently), and writes the result into the 1D array v3 as the rows of matrix-3 (1024x1024).

At the moment, kernel execution time for a 1024x1024 square matrix-matrix multiplication on an HD7870 is 3.6 ms.

Optimizations done so far:

Question: I could not finish some of the optimizations, such as eliminating all local-memory (LDS) bank conflicts and reordering instructions to hide memory latency. What can I do to improve the performance of this math function?
This kernel is almost certainly limited by local-memory bandwidth (and bank conflicts). At 3.6 ms per multiplication:

(1024 * 1024 * 1024 * (1 add + 1 mult = 2 flops)) / 0.0036 s = 596 x 10^9 flops per second (596 GFlops)

I have seen online CUDA benchmarks for the GTX680 where they break the 1 TFlops mark. Is that because it has more local memory per compute unit, more cores, or both?

(1024 * 1024 * 1024 * (2 float reads) * (4 bytes per float)) / 0.0036 s = 2386 x 10^9 bytes per second

But this kernel reads 8 floats and uses them 16 times, so each float is reused twice:

2386 x 10^9 bytes/s / (reuse factor of 2) = 1193 GB/s
The theoretical maxima for the HD7870 are (here, appendix D):

compute power = 2560 giga floating-point operations per second, LDS bandwidth = 2560 GB/s, register-access bandwidth = 15360 GB/s

Here is the kernel:
__kernel void squareGpuMatrixMul(__global float * v1, __global float * v2, __global float * v3)
{
    int localRow = get_local_id(0);
    int localCol = get_local_id(1);
    int selectRowFromA = get_group_id(0)*32; // each 16x16 work-group computes a 32x32 output tile,
    int selectColFromB = get_group_id(1)*32; // split into four 16x16 sub-tiles (plain/a/b/c below)
    int lid = localCol*16+localRow;
    __local float Lcache1[16][16];
    __local float Lcache2[16][16];
    __local float Lcache3[16][16];
    __local float Lcache1a[16][16];
    __local float Lcache2a[16][16];
    __local float Lcache3a[16][16];
    __local float Lcache1b[16][16];
    __local float Lcache2b[16][16];
    __local float Lcache3b[16][16];
    __local float Lcache1c[16][16];
    __local float Lcache2c[16][16];
    __local float Lcache3c[16][16];
    float tmp0=0.0f;
    float tmp1=0.0f;
    float tmp2=0.0f;
    float tmp3=0.0f;
    float tmp4=0.0f;
    float tmp5=0.0f;
    float tmp6=0.0f;
    float tmp7=0.0f;
    float sumPatch=0.0f;
    float sumPatcha=0.0f;
    float sumPatchb=0.0f;
    float sumPatchc=0.0f;
    float sumPatch2=0.0f;
    float sumPatcha2=0.0f;
    float sumPatchb2=0.0f;
    float sumPatchc2=0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);
    Lcache3[localRow][localCol]=0.0f;  // accumulators for the four output sub-tiles
    Lcache3a[localRow][localCol]=0.0f;
    Lcache3b[localRow][localCol]=0.0f;
    Lcache3c[localRow][localCol]=0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i=0;i<1024;i+=32) // this is A's row and B's column parsed by sub-matrices
    {
        barrier(CLK_LOCAL_MEM_FENCE);
        Lcache1[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024];
        Lcache2[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024];
        Lcache1a[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+16];
        Lcache2a[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+16];
        Lcache1b[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+16384];
        Lcache2b[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+16384];
        Lcache1c[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+16+16384];
        Lcache2c[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+16+16384];
        barrier(CLK_LOCAL_MEM_FENCE);
        sumPatch=0.0f;
        sumPatcha=0.0f;
        sumPatchb=0.0f;
        sumPatchc=0.0f;
        sumPatch2=0.0f;
        sumPatcha2=0.0f;
        sumPatchb2=0.0f;
        sumPatchc2=0.0f;
        for(int kk=0;kk<16;kk++) // this is sub-matrix multiplication
        {
            read_mem_fence(CLK_LOCAL_MEM_FENCE);
            tmp0=Lcache1[kk][localRow];  // row-major
            tmp1=Lcache1a[kk][localRow]; // accesses
            tmp2=Lcache1b[kk][localRow]; // to local memory
            tmp3=Lcache1c[kk][localRow];
            tmp4=Lcache2[kk][localCol];
            tmp5=Lcache2a[kk][localCol];
            tmp6=Lcache2b[kk][localCol];
            tmp7=Lcache2c[kk][localCol];
            read_mem_fence(CLK_LOCAL_MEM_FENCE);
            sumPatch+=tmp0*tmp4;
            sumPatcha+=tmp0*tmp6;
            sumPatchb+=tmp2*tmp4;
            sumPatchc+=tmp2*tmp6;
            sumPatch2+=tmp1*tmp5;
            sumPatcha2+=tmp1*tmp7;
            sumPatchb2+=tmp3*tmp5;
            sumPatchc2+=tmp3*tmp7;
        }
        Lcache3[localRow][localCol]+=sumPatch+sumPatch2;
        Lcache3a[localRow][localCol]+=sumPatcha+sumPatcha2;
        Lcache3b[localRow][localCol]+=sumPatchb+sumPatchb2;
        Lcache3c[localRow][localCol]+=sumPatchc+sumPatchc2;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024]=Lcache3[localRow][localCol];
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+16]=Lcache3a[localRow][localCol];
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+16384]=Lcache3b[localRow][localCol];
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+16+16384]=Lcache3c[localRow][localCol];
    barrier(CLK_LOCAL_MEM_FENCE);
}
Here is what I tried in order to eliminate the bank conflicts, but the kernel execution time increased by about 20%:
for(int kk=0;kk<16;kk++)
{
    int nc=(kk+lid)&15; // different for all local threads,
                        // but stays within the 0-15 range;
                        // the summation order is not important:
                        // 0.+1.+...+15. or 14.+15.+0.+...+13.
                        // gives the correct answer
    read_mem_fence(CLK_LOCAL_MEM_FENCE);
    tmp0=Lcache1[nc][localRow];
    tmp1=Lcache1a[nc][localRow];
    tmp2=Lcache1b[nc][localRow];
    tmp3=Lcache1c[nc][localRow];
    tmp4=Lcache2[nc][localCol];
    tmp5=Lcache2a[nc][localCol];
    tmp6=Lcache2b[nc][localCol];
    tmp7=Lcache2c[nc][localCol];
    read_mem_fence(CLK_LOCAL_MEM_FENCE);
    sumPatch+=tmp0*tmp4;
    sumPatcha+=tmp0*tmp6;
    sumPatchb+=tmp2*tmp4;
    sumPatchc+=tmp2*tmp6;
    sumPatch2+=tmp1*tmp5;
    sumPatcha2+=tmp1*tmp7;
    sumPatchb2+=tmp3*tmp5;
    sumPatchc2+=tmp3*tmp7;
}
Could this be the broadcast mechanism of newer GPUs? And does summing over 16 elements mean that only 16 banks are used? The device has 32 banks for local access.
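To see how many banks each inner-loop read touches, a small host-side check can be run. This is a hedged sketch: it assumes the usual LDS mapping of one 4-byte word per bank across the 32 banks mentioned above, which may not match the hardware exactly.

#include <stdio.h>

/* Which LDS banks does Lcache1[kk][localRow] hit for localRow = 0..15?
 * Assumption (not from the post): 32 banks, one 4-byte word per bank. */
int main(void)
{
    const int BANKS = 32;
    for (int kk = 0; kk < 16; kk++)
    {
        int firstBank = (kk * 16 + 0)  % BANKS;   /* word index = kk*16 + localRow */
        int lastBank  = (kk * 16 + 15) % BANKS;
        printf("kk=%2d -> banks %2d..%2d\n", kk, firstBank, lastBank);
    }
    return 0;   /* every iteration touches only 16 of the 32 banks */
}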
Here is what I tried in order to hide memory latency:
for(int kk=0;kk<16;kk++)
{
    int nc=(kk+lid)&15; // different for all local threads,
                        // but stays within the 0-15 range;
                        // the summation order is not important:
                        // 0.+1.+...+15. or 14.+15.+0.+...+13.
                        // gives the correct answer
    read_mem_fence(CLK_LOCAL_MEM_FENCE);
    tmp0=Lcache1[nc][localRow];
    tmp4=Lcache2[nc][localCol];
    sumPatch+=tmp0*tmp4;
    tmp6=Lcache2b[nc][localCol];
    sumPatcha+=tmp0*tmp6;
    tmp1=Lcache1a[nc][localRow];
    tmp7=Lcache2c[nc][localCol];
    sumPatcha2+=tmp1*tmp7;
    tmp5=Lcache2a[nc][localCol];
    sumPatch2+=tmp1*tmp5;
    tmp2=Lcache1b[nc][localRow];
    sumPatchb+=tmp2*tmp4;
    sumPatchc+=tmp2*tmp6;
    tmp3=Lcache1c[nc][localRow];
    sumPatchb2+=tmp3*tmp5;
    sumPatchc2+=tmp3*tmp7;
    read_mem_fence(CLK_LOCAL_MEM_FENCE); // the position of this line does not change the time
}
But this neither increased nor decreased the execution time. How can I improve the kernel time? Is it even feasible?
Device: HD7870 @ 1000 MHz / 1200 MHz. Host: FX8150 @ 4 GHz. Headers and LIB files from the Khronos website, opencl.dll from the AMD drivers.
Time sampling: the kernel is cycled 100 times and the total time between the Stopwatch's start() and stop() is divided by 100.0. Execution only; array copies are not included.
All results are compared against a naive 3-nested-loop version using the same random input matrices (results agree within m(ij) +/- delta, where delta is 0.001f); a sketch of such a check is shown below.
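A minimal sketch of such a reference check, assuming the layouts described above (v1 row-major, v2 holding matrix-2 transposed); the function name and the way it is called (n = 1024, delta = 0.001f) are illustrative, not the actual host code:

#include <math.h>

/* Naive 3-nested-loop reference: result[r][col] = sum_k a[r][k] * b[k][col],
 * with v2 storing matrix-2 transposed, i.e. v2[col*n + k] == b[k][col]. */
int verify(const float *v1, const float *v2, const float *v3, int n, float delta)
{
    for (int r = 0; r < n; r++)
        for (int col = 0; col < n; col++)
        {
            float acc = 0.0f;
            for (int k = 0; k < n; k++)
                acc += v1[r * n + k] * v2[col * n + k];
            if (fabsf(acc - v3[r * n + col]) > delta)
                return 0;   /* mismatch beyond the tolerance */
        }
    return 1;               /* all elements within +/- delta */
}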
The kernel here is a simplified version of a more general kernel (for different matrix and patch sizes).

Kernel parameters for this version: Global = 512,512; Local = 16,16; Reference = 0,0.

For an 8320x8320 matrix -> Global = 4160,4160; Local = 16,16; ref = 0,0; time = 1.87 seconds.
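A hedged host-side sketch of how these NDRange sizes could be passed for the 1024x1024 case (error handling mostly omitted; queue and kernel are assumed to be created elsewhere in the host code, and the function name is illustrative):

#include <CL/cl.h>

/* Enqueue the kernel with the sizes quoted above. Each work-item writes four
 * results, so 512*512 work-items cover all 1024*1024 output elements. */
cl_int runSquareMul1024(cl_command_queue queue, cl_kernel kernel)
{
    size_t global[2] = { 512, 512 };   /* Global = 512,512                         */
    size_t local[2]  = { 16, 16 };     /* Local = 16,16 (one 32x32 tile per group) */
    cl_int err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,   /* offset 0,0 */
                                        global, local, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        err = clFinish(queue);         /* so the Stopwatch measures execution only */
    return err;
}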
EDIT: Following DarkZeros's suggestion, replacing the local Lcache3 with private accumulators improved the 1024x1024 time to 2.7 ms. That is 795 GFlops per second. This must come from the better occupancy.
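The idea, reduced to its simplest form (a hedged sketch, not the actual 32x32-tile kernel from this edit): the running sum lives in a private register instead of a __local Lcache3 array, and only the two input tiles go through local memory. This simplified version computes one output per work-item, so it would be launched with Global = 1024,1024 and Local = 16,16.

/* Minimal private-accumulator sketch; assumes v1 row-major and v2 holding
 * matrix-2 transposed, as in the kernels above. */
__kernel void squareGpuMatrixMulPrivateAcc(__global const float *v1,
                                           __global const float *v2,
                                           __global float *v3)
{
    const int N = 1024;
    int localRow = get_local_id(0);
    int localCol = get_local_id(1);
    int row = get_group_id(0) * 16 + localRow;   /* output row    */
    int col = get_group_id(1) * 16 + localCol;   /* output column */

    __local float tileA[16][16];
    __local float tileB[16][16];
    float acc = 0.0f;                            /* private accumulator */

    for (int i = 0; i < N; i += 16)
    {
        tileA[localRow][localCol] = v1[row * N + i + localCol];
        tileB[localCol][localRow] = v2[col * N + i + localRow];
        barrier(CLK_LOCAL_MEM_FENCE);            /* other work-items read these tiles */

        for (int kk = 0; kk < 16; kk++)
            acc += tileA[localRow][kk] * tileB[localCol][kk];

        barrier(CLK_LOCAL_MEM_FENCE);            /* tiles are overwritten next iteration */
    }
    v3[row * N + col] = acc;                     /* single write, no Lcache3 needed */
}

Dropping the four Lcache3 arrays reduces the LDS used per work-group, which is consistent with the better-occupancy explanation above.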
EDIT 2: The lower local-memory usage opened up the possibility of using 48x48 (9 x 16x16) patches, which makes a 1056x1056 multiplication take 2.4 ms -> 981 GFlops/s. An 8208x8208 multiplication finishes in 961 ms, which is more than 1150 GFlops.
Answer (score: 2):
Why so many fences? In fact, I don't think you need them at all. You only need a fence when a thread's writes to local memory are going to be read by other threads, not when that same thread reads and writes local memory.

BTW, fences are much better than barriers. With a barrier you force the threads to be synchronized, and in some cases that hurts performance.

I think you could gain quite a lot of speed by rewriting the code with a different memory-access model.

You can try whether this is any better (I made many obvious optimizations, without knowing what your code is even doing):
__kernel void squareGpuMatrixMul(__global float * v1, __global float * v2, __global float * v3)
{
    int localRow = get_local_id(0);
    int localCol = get_local_id(1);
    int selectRowFromA = get_group_id(0)*32;
    int selectColFromB = get_group_id(1)*32;
    int lid = localCol*16+localRow;
    __local float Lcache1[16][16];
    __local float Lcache2[16][16];
    __local float Lcache3[16][16];
    __local float Lcache1a[16][16];
    __local float Lcache2a[16][16];
    __local float Lcache3a[16][16];
    __local float Lcache1b[16][16];
    __local float Lcache2b[16][16];
    __local float Lcache3b[16][16];
    __local float Lcache1c[16][16];
    __local float Lcache2c[16][16];
    __local float Lcache3c[16][16];
    float tmp0=0.0f;
    float tmp1=0.0f;
    float tmp2=0.0f;
    float tmp3=0.0f;
    float tmp4=0.0f;
    float tmp5=0.0f;
    float tmp6=0.0f;
    float tmp7=0.0f;
    float sumPatch=0.0f;
    float sumPatcha=0.0f;
    float sumPatchb=0.0f;
    float sumPatchc=0.0f;
    float sumPatch2=0.0f;
    float sumPatcha2=0.0f;
    float sumPatchb2=0.0f;
    float sumPatchc2=0.0f;
    Lcache3[localRow][localCol]=0.0f;
    Lcache3a[localRow][localCol]=0.0f;
    Lcache3b[localRow][localCol]=0.0f;
    Lcache3c[localRow][localCol]=0.0f;
    for(int i=0;i<1024;i+=32) // this is A's row and B's column parsed by sub-matrices
    {
        Lcache1[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024];
        Lcache2[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024];
        Lcache1a[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+16];
        Lcache2a[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+16];
        Lcache1b[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+16384];
        Lcache2b[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+16384];
        Lcache1c[localCol][localRow]=v1[selectRowFromA*1024+i+localCol+localRow*1024+16+16384];
        Lcache2c[localRow][localCol]=v2[selectColFromB*1024+i+localRow+localCol*1024+16+16384];
        mem_fence(CLK_LOCAL_MEM_FENCE);
        sumPatch=0.0f;
        sumPatcha=0.0f;
        sumPatchb=0.0f;
        sumPatchc=0.0f;
        sumPatch2=0.0f;
        sumPatcha2=0.0f;
        sumPatchb2=0.0f;
        sumPatchc2=0.0f;
        for(int kk=0;kk<16;kk++) // this is sub-matrix multiplication
        {
            tmp0=Lcache1[kk][localRow];  // row-major
            tmp1=Lcache1a[kk][localRow]; // accesses
            tmp2=Lcache1b[kk][localRow]; // to local memory
            tmp3=Lcache1c[kk][localRow];
            tmp4=Lcache2[kk][localCol];
            tmp5=Lcache2a[kk][localCol];
            tmp6=Lcache2b[kk][localCol];
            tmp7=Lcache2c[kk][localCol];
            sumPatch+=tmp0*tmp4;
            sumPatcha+=tmp0*tmp6;
            sumPatchb+=tmp2*tmp4;
            sumPatchc+=tmp2*tmp6;
            sumPatch2+=tmp1*tmp5;
            sumPatcha2+=tmp1*tmp7;
            sumPatchb2+=tmp3*tmp5;
            sumPatchc2+=tmp3*tmp7;
        }
        Lcache3[localRow][localCol]+=sumPatch+sumPatch2;
        Lcache3a[localRow][localCol]+=sumPatcha+sumPatcha2;
        Lcache3b[localRow][localCol]+=sumPatchb+sumPatchb2;
        Lcache3c[localRow][localCol]+=sumPatchc+sumPatchc2;
    }
    mem_fence(CLK_LOCAL_MEM_FENCE);
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024]=Lcache3[localRow][localCol];
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+16]=Lcache3a[localRow][localCol];
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+16384]=Lcache3b[localRow][localCol];
    v3[selectRowFromA*1024+selectColFromB+localCol+localRow*1024+16+16384]=Lcache3c[localRow][localCol];
}