I have this kernel:
__global__ void kernel1(int keep, int include, int width, int* d_Xco,
                        int* d_Xnum, bool* d_Xvalid, float* d_Xblas)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;  // one thread per row
    if (i < keep) {
        for (int k = 0; k < include; k++) {
            int val = (d_Xblas[i*include + k] >= 1e5);  // 1 for "large" entries
            int aux = d_Xnum[i];               // current slot in the compacted row
            d_Xblas[i*include + k] *= (!val);  // zero out large entries
            d_Xco[i*width + aux] = k;          // record k at the current slot
            d_Xnum[i] += val;                  // advance the slot only when val == 1
            d_Xvalid[i*include + k] = (!val);
        }
    }
}
launched with:
int keep = 9000;
int include = 23000;
int width = 0.2*include;
int threads = 192;
int blocks = (keep + threads - 1)/threads;  // ceiling division for the grid size
kernel1 <<< blocks, threads >>>( keep, include, width,
                                 d_Xco, d_Xnum, d_Xvalid, d_Xblas );
This kernel1 works fine, but it is clearly not fully optimized. I thought eliminating the inner k loop would be simple, but for some reason it does not work correctly. My first idea was:
__global__ void kernel2(int keep, int include, int width,
                        int* d_Xco, int* d_Xnum, bool* d_Xvalid,
                        float* d_Xblas)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;  // row index
    int k = threadIdx.y + blockIdx.y * blockDim.y;  // column index
    if ((i < keep) && (k < include)) {
        int val = (d_Xblas[i*include + k] >= 1e5);
        int aux = d_Xnum[i];                      // unsynchronized read of the shared counter
        d_Xblas[i*include + k] *= (float)(!val);
        d_Xco[i*width + aux] = k;                 // many k threads can write the same slot
        atomicAdd(&d_Xnum[i], val);               // atomic replaces the += from kernel1
        d_Xvalid[i*include + k] = (!val);
    }
}
launched with a 2D grid:
int keep = 9000;
int include = 23000;
int width = 0.2*include;
int th = 32;
dim3 threads(th,th);
dim3 blocks ((keep+threads.x-1)/threads.x, (include+threads.y-1)/threads.y);
kernel2 <<< blocks,threads >>>( keep, include, width, d_Xco, d_Xnum,
d_Xvalid, d_Xblas );
While I believe the idea is good, it does not work, and I am out of ideas here. Can you help me? I also think the problem may be in d_Xco, which stores the positions k in a smaller array, pushing them to the front of the array, so the order matters.
d_Xco
-------------------------------
| 2 | 3 | 15 | 4 | 5 | 5 |  |  | .......
-------------------------------
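To make the ordering issue concrete, here is a small host-side trace of kernel1's per-row loop on a made-up 4-element row (a sketch; the data values and the standalone main are my assumptions, not from the question):

#include <cstdio>

// Serial trace of kernel1's per-row loop on assumed sample data.
int main()
{
    const int include = 4;
    float Xblas[include] = {2e5f, 3.0f, 1e6f, 7.0f};  // assumed row contents
    int Xco[include] = {0};
    int Xnum = 0;

    for (int k = 0; k < include; k++) {
        int val = (Xblas[k] >= 1e5f);  // 1 for "large" entries
        int aux = Xnum;                // slot depends on all earlier iterations
        Xblas[k] *= (float)(!val);
        Xco[aux] = k;                  // k=1 writes slot 1, then k=2 overwrites it
        Xnum += val;
    }
    printf("Xco =");
    for (int k = 0; k < include; k++) printf(" %d", Xco[k]);
    printf(", Xnum = %d\n", Xnum);     // prints: Xco = 0 2 3 0, Xnum = 2
    return 0;
}

Each write's target slot depends on how many val == 1 entries came before it, which is why the k iterations cannot simply run in parallel against a stale read of d_Xnum[i].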
Answer (score: 1)
In the original code, you have
for(k = 0; k < include ; k++){
    ...
    int aux = d_Xnum[i];
    ...
    d_Xco[i*width + aux] = k;
    ...
}
The index into the d_Xco array does not depend on k, so writing to it on every iteration is redundant; the final value is always include-1. So replace those two lines inside the k loop with a single line after the loop:
d_Xco[i*width + d_Xnum[i]] = include - 1;
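For concreteness, this is how I read that suggestion applied to kernel1 (a sketch; the name kernel1_v2 is mine, and everything else is kept unchanged):

__global__ void kernel1_v2(int keep, int include, int width, int* d_Xco,
                           int* d_Xnum, bool* d_Xvalid, float* d_Xblas)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < keep) {
        for (int k = 0; k < include; k++) {
            int val = (d_Xblas[i*include + k] >= 1e5);
            d_Xblas[i*include + k] *= (!val);
            d_Xnum[i] += val;
            d_Xvalid[i*include + k] = (!val);
        }
        // the two d_Xco lines from the loop, replaced by one write after it
        d_Xco[i*width + d_Xnum[i]] = include - 1;
    }
}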
Once you do that, when you parallelize the k loop you will no longer have the current race condition, in which many k threads simultaneously assign different values to the same location in d_Xco (with no ordering guaranteed).
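With the d_Xco write hoisted out, the remaining per-(i,k) work is independent and the atomicAdd alone handles d_Xnum. A sketch of what the parallel version might then look like (kernel2_v2 and kernel3 are hypothetical names; the single d_Xco write moves to a small 1D follow-up kernel so it sees the final d_Xnum[i]):

__global__ void kernel2_v2(int keep, int include, int* d_Xnum,
                           bool* d_Xvalid, float* d_Xblas)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int k = threadIdx.y + blockIdx.y * blockDim.y;
    if ((i < keep) && (k < include)) {
        int val = (d_Xblas[i*include + k] >= 1e5);
        d_Xblas[i*include + k] *= (float)(!val);
        atomicAdd(&d_Xnum[i], val);       // order of increments no longer matters
        d_Xvalid[i*include + k] = (!val);
    }
}

// one thread per row, launched after kernel2_v2 has completed
__global__ void kernel3(int keep, int include, int width,
                        int* d_Xco, const int* d_Xnum)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < keep)
        d_Xco[i*width + d_Xnum[i]] = include - 1;
}

Splitting the write into its own kernel is one way to guarantee it observes the finished count, since there is no grid-wide synchronization available inside a single launch here.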