Question

我正在尝试在一个可以正常工作的精简内核中加入本地原子操作（local atomics），做法类似于DarkZeros在此处所描述的那样。该内核在一组点中找出最高点；使用本地原子操作的目的是让我能够把选中的point_id无间隙地过滤到输出数组中。

目前，当我用本地原子操作递增计数器、并以其返回值作为下标向本地数组写入时，内核可以运行，但得到的全局最高点是错误的。如果把原子操作那一行注释掉，则会返回正确的结果。

这里发生了什么以及如何解决？

简化的内核代码：

// reduce: per-work-group search for the point with the largest z component.
//
// Each work-item walks `input` in pairs, loads the two referenced float4
// values from `dataSet`, and keeps the larger-z candidate in its own slot
// shared[local_id]. After a barrier, a tree step folds shared[local_id+256]
// into shared[local_id]; the remaining steps are elided in this simplified
// listing. Work-item 0 finally writes (int)shared[0].w + 1 to output[group_id].
//
// Parameters:
//   dataSet      float4 points; .z is compared, .w is what gets written out.
//   input        indices into dataSet; the code subtracts 1 before indexing
//                (presumably 1-based ids -- confirm against the host code).
//   items        used as group_size; NOTE(review): presumably equal to
//                get_local_size(0) -- confirm.
//   output       one int result per group_id.
//   shared       __local scratch indexed by local_id (and local_id+256);
//                its size is chosen by the host and is not visible here.
//   n            upper bound for the loop index i.
//   pass         the local-atomic path only runs when pass == 0.
//   filtered, tri_input, global_count
//                not referenced anywhere in this simplified listing.
__kernel void reduce(__global const float4* dataSet, __global const int* input, const unsigned int items,                                   //points and index
                    __global int* output, __local float4* shared, const unsigned int n,                                                 //finding highest
                        __global int* filtered, __global const float2* tri_input, const unsigned int pass,                              //finding filtered
                            __global int* global_count                                                                                  //global count
                                ){
//set everything up

const unsigned int group_id = get_global_id(0) / get_local_size(0);
const unsigned int local_id = get_local_id(0);
const unsigned int group_size = items;
const unsigned int group_stride = 2 * group_size;
// NOTE(review): the loop advances by group_stride * group_size; reductions of
// this shape often stride by group_stride * number_of_groups -- confirm intent.
const int local_stride = group_stride * group_size;

// Clear this work-item's own slot so the first comparison in the loop sees z == 0.
__local float4 *zeroIt = &shared[local_id];
zeroIt->x = 0; zeroIt->y = 0; zeroIt->z = 0; zeroIt->w = 0;

// Work-group-shared counter, flag, and a 64-entry scratch array.
volatile __local int local_count_set_1;
volatile __local int global_val_set_1;
volatile __local int filter_local[64];

// Work-item 0 initialises the shared scalars; the barrier orders those writes
// before any other work-item in the group touches them.
if(local_id==0){
    local_count_set_1 = 0;
    global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);

int i = group_id * group_stride + local_id;

while (i < n){
//load up a pair of points using the index to locate them within a massive dataSet
    int ia = input[i];
    float4 a = dataSet[ia-1];

    // NOTE(review): only i is checked against n; i + group_size is read
    // unchecked, so this may index past n near the end of the data -- confirm.
    int ib = input[i + group_size];
    float4 b = dataSet[ib-1];

    //on the first pass kernel increment a local count
    if(pass == 0){
        // atomic_inc returns the counter's previous value, which is used here
        // as the index into filter_local (64 entries). The counter is bumped
        // once per loop iteration per work-item and is only zeroed before the
        // loop, so nothing in this code keeps the index below 64.
        // NOTE(review): an index >= 64 writes outside filter_local; that may be
        // what corrupts the reduction result -- confirm.
        filter_local[atomic_inc(&local_count_set_1)] = 1;  //including this line causes an erroneous highest point result
        //filter_local[local_id] = 1; //but including this line does not
        //atomic_inc(&local_count_set_1); //and neither does this one
    }

    //find the highest of the pair
    float4 result;
    if(a.z>b.z) result = a;
    else result = b;

    //load up the previous highest result locally
    float4 s = shared[local_id];

    //if the previous highest beat this, stick, else twist
    if(s.z>result.z){ result = s; }
    shared[local_id] = result;
    i += local_stride;
}

// Every work-item's shared slot must be written before the tree step reads
// another work-item's slot.
barrier(CLK_LOCAL_MEM_FENCE);
if (group_size >= 512){
    if (local_id < 256) {
        // Keep whichever of slots local_id and local_id+256 has the larger z.
        __local float4 *a = &shared[local_id];
        __local float4 *b = &shared[local_id+256];
        if(b->z>a->z){  shared[local_id] = shared[local_id+256]; }
    }}

//repeat barrier ops in increments down to group_size>=2 - this filters the highest result in shared
//finally, return the filtered highest result of shared to the global level

barrier(CLK_LOCAL_MEM_FENCE);
    if(local_id == 0){
        // The winner's .w is converted to int and written out plus one
        // (presumably .w carries a 0-based point index -- confirm).
        __local float4 *v = &shared[0];
        int send = v->w ;
        output[group_id] = send+1;
    }}

[更新]：当包含atomic_inc那一行时，“错误的”最高点结果总是测试数据集中靠近末尾的某个点。我猜这意味着atomic_inc影响了后面的比较，但我不确定具体是什么受到了影响，或者发生在哪里。

[更新]：已根据调试过程中的改动编辑了代码，以便简化/澄清/更新。问题仍然没有解决，快把我逼疯了。

Answer 1

真是让人捂脸的时刻。在内核的初始化阶段，有以下几行：

// Setup as it appears in the question: work-item 0 initialises both shared
// scalars once, and the barrier orders those writes before the rest of the
// work-group proceeds.
if(local_id==0){
   local_count_set_1 = 0;
   global_val_set_1 = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);

把这几行拆开，并把local_count_set_1的重置放进while循环内部之后，错误就不再出现了。即：

// Only global_val_set_1 is still initialised once, before the loop.
if(local_id==0) global_val_set_1 = -1;
barrier(CLK_LOCAL_MEM_FENCE);

while (i < n){
    // Zero the shared counter at the top of every iteration so the index
    // handed out by atomic_inc starts from 0 again on each trip.
    // NOTE(review): a barrier() inside a loop has to be reached by every
    // work-item of the group on every iteration; confirm that all work-items
    // execute the same number of iterations for the given n.
    if(local_id==0) local_count_set_1 = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    ....
    // Compare with ==. Written as `pass = 0` this would be an assignment whose
    // value is always false (and pass is declared const in the kernel).
    if(pass == 0){
        filter_local[atomic_inc(&local_count_set_1)] = 1;
    }
    ....

我希望这样可以解决问题；如果不行，我会再来更新。

好吧……这个周末我是再也找不回来了。

OpenCL：在归约（reduction）内核中加入本地atomic_inc

1 个答案: