我有一个简单的内核,它从一个双精度(double)数组中求出最大值,并保存该最大值的索引。所有计算都在本地内存(local memory)中完成,工作组大小为 64。
简而言之,我创建了一个索引数组,用连续的数字(即 local_id)填充它,然后在归约过程中根据适应度值的比较结果移动这些索引。
目前我的主要问题是:从这个索引数组中读出来的值完全是乱的。我在内核中加了一条 printf 语句,它本应打印出类似下面的内容:
>> my_index 1 ; other_index 33 ; local_id 1 <<
>> my_index 2; other_index 34; local_id 2 <<
>> .......
我得到的输出是
>> my_index 1600085855 ; other_index 32 ; local_id 0 <<
>> my_index 1072652127 ; other_index 33 ; local_id 1 <<
>> my_index 942797699 ; other_index 34 ; local_id 2 <<
>> my_index 1072923423 ; other_index 35 ; local_id 3 <<
>> my_index -987348804 ; other_index 36 ; local_id 4 <<
>> my_index 1072849931 ; other_index 37 ; local_id 5 <<
>> my_index -833351863 ; other_index 38 ; local_id 6 <<
>> my_index 1073209710 ; other_index 39 ; local_id 7 <<
>> my_index -833351863 ; other_index 40 ; local_id 8 <<
>> my_index 1073209710 ; other_index 41 ; local_id 9 <<
>> my_index 1206451488 ; other_index 42 ; local_id 10 <<
>> my_index 1072822847 ; other_index 43 ; local_id 11 <<
>> my_index -1561806289 ; other_index 44 ; local_id 12 <<
>> my_index 1072836235 ; other_index 45 ; local_id 13 <<
>> my_index 1797893287 ; other_index 46 ; local_id 14 <<
>> my_index 1072863946 ; other_index 47 ; local_id 15 <<
>> my_index 1499829849 ; other_index 48 ; local_id 16 <<
>> my_index 1073309078 ; other_index 49 ; local_id 17 <<
>> my_index 1215556782 ; other_index 50 ; local_id 18 <<
>> my_index 1073623117 ; other_index 51 ; local_id 19 <<
>> my_index -1741202958 ; other_index 52 ; local_id 20 <<
>> my_index 1073061666 ; other_index 53 ; local_id 21 <<
>> my_index 1908874354 ; other_index 54 ; local_id 22 <<
>> my_index 1072809756 ; other_index 55 ; local_id 23 <<
>> my_index 1499829849 ; other_index 56 ; local_id 24 <<
>> my_index 1073309078 ; other_index 57 ; local_id 25 <<
>> my_index 1965493508 ; other_index 58 ; local_id 26 <<
>> my_index 1073421919 ; other_index 59 ; local_id 27 <<
>> my_index -1908874354 ; other_index 60 ; local_id 28 <<
>> my_index 1073101027 ; other_index 61 ; local_id 29 <<
>> my_index -1561806289 ; other_index 62 ; local_id 30 <<
>> my_index 31 ; other_index 63 ; local_id 31 <<
>> my_index 1600085855 ; other_index 1499829849 ; local_id 0 <<
>> my_index 1072652127 ; other_index 1073309078 ; local_id 1 <<
>> my_index 942797699 ; other_index 1215556782 ; local_id 2 <<
>> my_index 1072923423 ; other_index 1073623117 ; local_id 3 <<
>> my_index -987348804 ; other_index -1741202958 ; local_id 4 <<
>> my_index 1072849931 ; other_index 1073061666 ; local_id 5 <<
>> my_index -833351863 ; other_index 1908874354 ; local_id 6 <<
>> my_index 1073209710 ; other_index 1072809756 ; local_id 7 <<
>> my_index -833351863 ; other_index 1499829849 ; local_id 8 <<
>> my_index 1073209710 ; other_index 1073309078 ; local_id 9 <<
>> my_index 1206451488 ; other_index 1965493508 ; local_id 10 <<
>> my_index 1072822847 ; other_index 1073421919 ; local_id 11 <<
>> my_index -1561806289 ; other_index -1908874354 ; local_id 12 <<
>> my_index 1072836235 ; other_index 1073101027 ; local_id 13 <<
>> my_index 1797893287 ; other_index -1561806289 ; local_id 14 <<
>> my_index 1072863946 ; other_index 31 ; local_id 15 <<
>> my_index 1600085855 ; other_index -833351863 ; local_id 0 <<
>> my_index 1072652127 ; other_index 1073309078 ; local_id 1 <<
>> my_index 942797699 ; other_index 1965493508 ; local_id 2 <<
>> my_index 1073623117 ; other_index 1072822847 ; local_id 3 <<
>> my_index -1741202958 ; other_index -1908874354 ; local_id 4 <<
>> my_index 1072849931 ; other_index 1073101027 ; local_id 5 <<
>> my_index -833351863 ; other_index 1797893287 ; local_id 6 <<
>> my_index 1073209710 ; other_index 1072863946 ; local_id 7 <<
>> my_index -833351863 ; other_index -1908874354 ; local_id 0 <<
>> my_index 1073309078 ; other_index 1072849931 ; local_id 1 <<
>> my_index 942797699 ; other_index -833351863 ; local_id 2 <<
>> my_index 1073623117 ; other_index 1072863946 ; local_id 3 <<
>> my_index -833351863 ; other_index -833351863 ; local_id 0 <<
>> my_index 1073309078 ; other_index 1072863946 ; local_id 1 <<
>> my_index -833351863 ; other_index 1072863946 ; local_id 0 <<
怎么可能?
代码:
/*
 * reduce - find the largest fitness value within one work-group and flip the
 * corresponding entry of inputAgent.
 *
 * Parameters:
 *   inputAgent   - global char buffer; the element at the winning index is
 *                  replaced by (1 - element), i.e. toggled between 0 and 1.
 *   output       - global buffer; output[0] receives the largest fitness.
 *   bestFitness  - global buffer holding `size` fitness values.
 *   localFitness - local scratch buffer, indexed by every local id, so the
 *                  host must reserve at least
 *                  get_local_size(0) * sizeof(double) bytes for it.
 *   indexes      - local scratch buffer, indexed by every local id, so the
 *                  host must reserve at least
 *                  get_local_size(0) * sizeof(int) bytes for it.
 *   size         - number of valid entries in bestFitness.
 *
 * Notes:
 *   - Both __local buffers are read and written at [0, get_local_size(0)).
 *     An undersized (or overlapping) allocation on the host side makes those
 *     accesses out of bounds and corrupts the values read back.
 *   - The halving loop pairs element i with element i + offset, so every
 *     element is only visited when the work-group size is a power of two.
 *   - Only local ids are used; entries of bestFitness at or beyond
 *     get_local_size(0) are never read, and every work-group writes
 *     output[0]. NOTE(review): presumably launched as a single work-group --
 *     confirm against the host code.
 *   - Slots beyond `size` are padded with 0, so a padding slot wins when all
 *     real fitness values are negative; in that case no entry is toggled.
 *   - Uses `double`, which requires device support for double precision
 *     (cl_khr_fp64 on OpenCL 1.x devices).
 */
__kernel void reduce(__global char* inputAgent,
                     __global double* output,
                     __global double* bestFitness,
                     __local double* localFitness,
                     __local int* indexes,
                     const unsigned int size)
{
    int local_id = get_local_id(0);

    /* Stage the fitness values in local memory. Valid indices are
     * 0 .. size-1, so the guard must be a strict '<': with '<=' the
     * work-item whose id equals `size` reads one element past the end
     * of bestFitness. */
    if ((unsigned int)local_id < size) {
        localFitness[local_id] = bestFitness[local_id];
    } else {
        localFitness[local_id] = 0;
    }

    /* Each slot starts out pointing at itself. */
    indexes[local_id] = local_id;

    /* All slots must be written before any work-item reads a neighbour's. */
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int offset = get_local_size(0) / 2; offset > 0; offset >>= 1) {
        if (local_id < offset) {
            /* Compare this slot with its partner in the upper half. */
            double mine = localFitness[local_id];
            double other = localFitness[local_id + offset];
            int my_index = indexes[local_id];
            int other_index = indexes[local_id + offset];

            /* On a tie the partner's value and index are kept. */
            int keep_mine = (mine > other);

            localFitness[local_id] = keep_mine ? mine : other;
            /* Carry the index along with the winning fitness. */
            indexes[local_id] = keep_mine ? my_index : other_index;

            printf(">> my_index %d ; other_index %d ; local_id %d <<",
                   my_index, other_index, local_id);
        }
        /* Outside the 'if' so that every work-item reaches the barrier. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Work-item 0 holds the overall winner after the last pass. */
    if (local_id == 0) {
        output[0] = localFitness[0];

        int bit_to_change = indexes[0];
        /* Skip the toggle when a padding slot (index >= size) won. */
        if (bit_to_change < size) {
            inputAgent[bit_to_change] = 1 - inputAgent[bit_to_change];
        }
    }
}
欢迎任何建议。
修改:
我把问题定位到了 barrier 附近。我又添加了两条 printf:一条紧挨在第一个 barrier 之前,另一条紧跟在它之后:
[...]
indexes[localID] = localID;
printf("<<< Fit %f for %d\n", localFitness[localID], indexes[localID] );
barrier(CLK_LOCAL_MEM_FENCE);
printf("*** Fit %f for %d\n", localFitness[localID], indexes[localID] );
[...]
输出我得到:
<<< Fit 0.990099 for 0
<<< Fit 1.449275 for 1
<<< Fit 1.538462 for 2
<<< Fit 1.030928 for 3
[...]
******* Fit 0.990099 for -1636178018
******* Fit 1.449275 for 1072593383
******* Fit 1.538462 for -1184818564
******* Fit 1.030928 for 1072042407
******* Fit 2.222222 for -1688619621
******* Fit 2.222222 for 1072388533
[...]
这似乎表明 barrier 没有起作用;而且我不知道该怎么办。有什么想法吗?
答案 0 :(得分:0)
我能够(在我的同事的帮助下)找到问题所在。从Erlang到OpenCL的绑定存在一个小问题。稍后我会详细介绍我是如何找到它的(只是为了提供我的调试路径),但是现在我想把这个问题标记为已解决。