I am developing a set of OpenCL kernels, each of which performs the same total number of floating-point operations. Every kernel has a "for" loop with unrolled operations inside it; the only difference between the kernels is the number of loop iterations. In other words, some kernels have more unrolled operations and fewer iterations, while others have more iterations and fewer unrolled operations.
Here are the kernels I wrote:
Add16-20Unroll-282Iters (282 iterations, 20 operations per iteration)
#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void Add16(__global double *data, int nIters) {
    int gid = get_global_id(0), globalSize = get_global_size(0);
    double s = data[gid];
    double16 s0 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    /* 20 dependent double16 subtractions per iteration (20 x 16 FLOPs) */
    for (int j = 0; j < nIters; ++j){
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
    }
    data[gid] = s0.s0+s0.s1+s0.s2+s0.s3+s0.s4+s0.s5+s0.s6+s0.s7+s0.s8+s0.s9+s0.sa+s0.sb+s0.sc+s0.sd+s0.se+s0.sf;
}
Add16-30Unroll-141Iters (141 iterations, 30 operations per iteration)
#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void Add16(__global double *data, int nIters) {
    int gid = get_global_id(0), globalSize = get_global_size(0);
    double s = data[gid];
    double16 s0 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    for (int j = 0; j < nIters; ++j){
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
        s0=10.f-s0;
    }
    data[gid] = s0.s0+s0.s1+s0.s2+s0.s3+s0.s4+s0.s5+s0.s6+s0.s7+s0.s8+s0.s9+s0.sa+s0.sb+s0.sc+s0.sd+s0.se+s0.sf;
}
There are three more kernels along the same lines: "Add16-60Unroll-70Iters", "Add16-120Unroll-35Iters", and "Add16-240Unroll-17Iters".
My first expectation was that the kernels with more iterations would perform worse than the more heavily unrolled versions, but unfortunately that is not the case. Here are the performance results as computed in my code:
test atts units median mean stddev min max trial0 trial1 trial2 trial3 trial4 trial5 trial6 trial7 trial8 trial9
Add16-20Unroll-282Iters-DP Size:2097152 GFLOPS 704.111 704.16 0.158984 704.009 704.501 704.019 704.064 704.37 704.024 704.234 704.501 704.009 704.188 704.158 704.033
Add16-30Unroll-141Iters-DP Size:2097152 GFLOPS 704.622 704.698 0.422365 704.243 705.527 704.318 704.671 704.256 704.574 704.243 704.681 704.87 704.47 705.527 705.37
Add16-60Unroll-70Iters-DP Size:2097152 GFLOPS 705.891 705.844 0.201042 705.286 706.058 705.864 705.974 705.937 705.871 705.919 706.058 705.747 705.87 705.286 705.91
Add16-120Unroll-35Iters-DP Size:2097152 GFLOPS 705.912 705.851 0.292976 705.284 706.373 705.91 705.67 705.962 705.987 706.373 705.815 706.103 705.284 705.914 705.493
Add16-240Unroll-17Iters-DP Size:2097152 GFLOPS 676.128 676.217 0.373075 675.777 676.845 676.105 676.266 676.151 675.777 675.968 676.839 675.787 676.49 676.845 675.94
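The timing harness itself is not shown above. For reference, here is a minimal host-side sketch of how such GFLOPS figures can be obtained with OpenCL event profiling; the helper name measure_gflops is purely illustrative, it assumes the command queue was created with CL_QUEUE_PROFILING_ENABLE and the kernel arguments are already set, and it counts each unrolled double16 subtraction as 16 FLOPs.

#include <CL/cl.h>

/* Hypothetical helper (not the harness used for the table above): time one
 * launch of the kernel with event profiling and convert the result to GFLOPS. */
static double measure_gflops(cl_command_queue queue, cl_kernel kernel,
                             size_t globalSize, int nIters, int unrollFactor)
{
    cl_event evt;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, &evt);
    clWaitForEvents(1, &evt);

    cl_ulong start, end;
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
    clReleaseEvent(evt);

    double seconds = (double)(end - start) * 1e-9;  /* profiling counters are in nanoseconds */
    double flops = (double)globalSize * nIters * unrollFactor * 16.0;  /* 16 lanes per double16 op */
    return flops / seconds / 1e9;
}

For example, Add16-20Unroll-282Iters at Size:2097152 would correspond to measure_gflops(queue, kernel, 2097152, 282, 20).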
So my question is: am I missing something here, or is it possible that a specific optimization happens when the kernels are compiled?
In general, how would you suggest observing the effect of the loop on OpenCL performance when running on a GPU?
EDIT:
Here I changed the kernels to make sure there is no dependency between the operations inside the for loop. Instead of doing all the computation on s0 alone, I create separate variables s0 through s19. Below is the modified kernel for Add16-20Unroll-282Iters:
#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void Add16(__global double *data, int nIters) {
    int gid = get_global_id(0), globalSize = get_global_size(0);
    double s = data[gid];
    double16 s0 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s1 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s2 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s3 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s4 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s5 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s6 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s7 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s8 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s9 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s10 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s11 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s12 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s13 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s14 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s15 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s16 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s17 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s18 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    double16 s19 = s + (double16)(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5);
    /* 20 independent double16 subtractions per iteration (20 x 16 FLOPs) */
    for (int j = 0; j < nIters; ++j){
        s0=10.f-s0;
        s1=10.f-s1;
        s2=10.f-s2;
        s3=10.f-s3;
        s4=10.f-s4;
        s5=10.f-s5;
        s6=10.f-s6;
        s7=10.f-s7;
        s8=10.f-s8;
        s9=10.f-s9;
        s10=10.f-s10;
        s11=10.f-s11;
        s12=10.f-s12;
        s13=10.f-s13;
        s14=10.f-s14;
        s15=10.f-s15;
        s16=10.f-s16;
        s17=10.f-s17;
        s18=10.f-s18;
        s19=10.f-s19;
    }
    data[gid] += s0.s0+s0.s1+s0.s2+s0.s3+s0.s4+s0.s5+s0.s6+s0.s7+s0.s8+s0.s9+s0.sa+s0.sb+s0.sc+s0.sd+s0.se+s0.sf;
    data[gid] += s1.s0+s1.s1+s1.s2+s1.s3+s1.s4+s1.s5+s1.s6+s1.s7+s1.s8+s1.s9+s1.sa+s1.sb+s1.sc+s1.sd+s1.se+s1.sf;
    data[gid] += s2.s0+s2.s1+s2.s2+s2.s3+s2.s4+s2.s5+s2.s6+s2.s7+s2.s8+s2.s9+s2.sa+s2.sb+s2.sc+s2.sd+s2.se+s2.sf;
    data[gid] += s3.s0+s3.s1+s3.s2+s3.s3+s3.s4+s3.s5+s3.s6+s3.s7+s3.s8+s3.s9+s3.sa+s3.sb+s3.sc+s3.sd+s3.se+s3.sf;
    data[gid] += s4.s0+s4.s1+s4.s2+s4.s3+s4.s4+s4.s5+s4.s6+s4.s7+s4.s8+s4.s9+s4.sa+s4.sb+s4.sc+s4.sd+s4.se+s4.sf;
    data[gid] += s5.s0+s5.s1+s5.s2+s5.s3+s5.s4+s5.s5+s5.s6+s5.s7+s5.s8+s5.s9+s5.sa+s5.sb+s5.sc+s5.sd+s5.se+s5.sf;
    data[gid] += s6.s0+s6.s1+s6.s2+s6.s3+s6.s4+s6.s5+s6.s6+s6.s7+s6.s8+s6.s9+s6.sa+s6.sb+s6.sc+s6.sd+s6.se+s6.sf;
    data[gid] += s7.s0+s7.s1+s7.s2+s7.s3+s7.s4+s7.s5+s7.s6+s7.s7+s7.s8+s7.s9+s7.sa+s7.sb+s7.sc+s7.sd+s7.se+s7.sf;
    data[gid] += s8.s0+s8.s1+s8.s2+s8.s3+s8.s4+s8.s5+s8.s6+s8.s7+s8.s8+s8.s9+s8.sa+s8.sb+s8.sc+s8.sd+s8.se+s8.sf;
    data[gid] += s9.s0+s9.s1+s9.s2+s9.s3+s9.s4+s9.s5+s9.s6+s9.s7+s9.s8+s9.s9+s9.sa+s9.sb+s9.sc+s9.sd+s9.se+s9.sf;
    data[gid] += s10.s0+s10.s1+s10.s2+s10.s3+s10.s4+s10.s5+s10.s6+s10.s7+s10.s8+s10.s9+s10.sa+s10.sb+s10.sc+s10.sd+s10.se+s10.sf;
    data[gid] += s11.s0+s11.s1+s11.s2+s11.s3+s11.s4+s11.s5+s11.s6+s11.s7+s11.s8+s11.s9+s11.sa+s11.sb+s11.sc+s11.sd+s11.se+s11.sf;
    data[gid] += s12.s0+s12.s1+s12.s2+s12.s3+s12.s4+s12.s5+s12.s6+s12.s7+s12.s8+s12.s9+s12.sa+s12.sb+s12.sc+s12.sd+s12.se+s12.sf;
    data[gid] += s13.s0+s13.s1+s13.s2+s13.s3+s13.s4+s13.s5+s13.s6+s13.s7+s13.s8+s13.s9+s13.sa+s13.sb+s13.sc+s13.sd+s13.se+s13.sf;
    data[gid] += s14.s0+s14.s1+s14.s2+s14.s3+s14.s4+s14.s5+s14.s6+s14.s7+s14.s8+s14.s9+s14.sa+s14.sb+s14.sc+s14.sd+s14.se+s14.sf;
    data[gid] += s15.s0+s15.s1+s15.s2+s15.s3+s15.s4+s15.s5+s15.s6+s15.s7+s15.s8+s15.s9+s15.sa+s15.sb+s15.sc+s15.sd+s15.se+s15.sf;
    data[gid] += s16.s0+s16.s1+s16.s2+s16.s3+s16.s4+s16.s5+s16.s6+s16.s7+s16.s8+s16.s9+s16.sa+s16.sb+s16.sc+s16.sd+s16.se+s16.sf;
    data[gid] += s17.s0+s17.s1+s17.s2+s17.s3+s17.s4+s17.s5+s17.s6+s17.s7+s17.s8+s17.s9+s17.sa+s17.sb+s17.sc+s17.sd+s17.se+s17.sf;
    data[gid] += s18.s0+s18.s1+s18.s2+s18.s3+s18.s4+s18.s5+s18.s6+s18.s7+s18.s8+s18.s9+s18.sa+s18.sb+s18.sc+s18.sd+s18.se+s18.sf;
    data[gid] += s19.s0+s19.s1+s19.s2+s19.s3+s19.s4+s19.s5+s19.s6+s19.s7+s19.s8+s19.s9+s19.sa+s19.sb+s19.sc+s19.sd+s19.se+s19.sf;
}