我用大小为2的工作组运行一个简单的OpenCL内核。内核把每个工作项的全局ID mod 2 存入result数组,然后由每个工作组中本地索引为0的工作项把该组在result数组中的值求和,并存入reduce数组。回到C代码后,除了最后20个元素之外,两边的数字都不匹配。这是怎么回事?
OpenCL内核:
/*
 * Stores (global id mod 2) for every work-item and writes one per-group sum.
 *
 * reduceBuffer  one element per work-group; element get_group_id(0) receives
 *               the sum of that group's entries.
 * result        one element per work-item; element get_global_id(0) receives
 *               get_global_id(0) % 2.
 * sum           local scratch space; this kernel indexes it with
 *               0 .. get_local_size(0) - 1, so the host has to provide at
 *               least that many long elements.
 */
__kernel void hello (__global long* reduceBuffer, __global long* result, __local long* sum) {
    const size_t gid = get_global_id(0);
    const size_t lid = get_local_id(0);

    result[gid] = gid % 2;
    sum[lid] = result[gid];

    /* Every work-item of the group must have filled its slot in `sum`
     * before work-item 0 reads them. Only local memory is shared here,
     * so a local fence is sufficient. */
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid == 0) {
        /* Add up exactly as many slots as were written instead of assuming
         * a work-group size of two. */
        const size_t groupSize = get_local_size(0);
        long total = 0;
        for (size_t i = 0; i < groupSize; ++i) {
            total += sum[i];
        }
        reduceBuffer[get_group_id(0)] = total;
    }
}
C代码:
int main() {
size_t numElements = 128;
size_t workGroupSize = 2;
// OpenCL preamble with lots of buffer initialization
clEnqueueNDRangeKernel (command_queue, kernel, 1, NULL, &numElements, &workGroupSize, 0, NULL, NULL);
// read result from write buffers
for (int i = 0; i < numElements/workGroupSize; i++) {
printf("%d. %ld %ld\n", i, reduceArr[i], resultArr[2 * i] + resultArr[2 * i + 1]);
}
// free everything
return 0;
}
输出:
0. 0 1
1. 0 1
2. 0 1
3. 0 1
4. 0 1
5. 0 1
6. 0 1
7. 0 1
8. 0 1
9. 0 1
10. 0 1
11. 0 1
12. 0 1
13. 0 1
14. 0 1
15. 0 1
16. 0 1
17. 0 1
18. 0 1
19. 0 1
20. 0 1
21. 0 1
22. 0 1
23. 0 1
24. 0 1
25. 0 1
26. 0 1
27. 0 1
28. 0 1
29. 0 1
30. 0 1
31. 0 1
32. 0 1
33. 0 1
34. 0 1
35. 0 1
36. 0 1
37. 0 1
38. 0 1
39. 0 1
40. 0 1
41. 0 1
42. 0 1
43. 0 1
44. 1 1
45. 1 1
46. 1 1
47. 1 1
48. 1 1
49. 1 1
50. 1 1
51. 1 1
52. 1 1
53. 1 1
54. 1 1
55. 1 1
56. 1 1
57. 1 1
58. 1 1
59. 1 1
60. 1 1
61. 1 1
62. 1 1
63. 1 1