以下代码来自the amd website
__kernel
void reduce(__global float* buffer,
__local float* scratch,
__const int length,
__global float* result) {
int global_index = get_global_id(0);
float accumulator = INFINITY;
// Loop sequentially over chunks of input vector
while (global_index < length) {
float element = buffer[global_index];
accumulator = (accumulator < element) ? accumulator : element;
global_index += get_global_size(0);
}
// Perform parallel reduction
int local_index = get_local_id(0);
scratch[local_index] = accumulator;
barrier(CLK_LOCAL_MEM_FENCE);
for(int offset = get_local_size(0) / 2;
offset > 0;
offset = offset / 2) {
if (local_index < offset) {
float other = scratch[local_index + offset];
float mine = scratch[local_index];
scratch[local_index] = (mine < other) ? mine : other;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (local_index == 0) {
result[get_group_id(0)] = scratch[0];
}
}
我对它进行了调整,使其可以减少总和:
__kernel
void reduce(__global float* buffer,
__local float* scratch,
__const int length,
__global float* result) {
int global_index = get_global_id(0);
float accumulator = 0.0;
// Loop sequentially over chunks of input vector
while (global_index < length) {
float element = buffer[global_index];
accumulator = accumulator + element;
global_index += get_global_size(0);
}
// Perform parallel reduction
int local_index = get_local_id(0);
scratch[local_index] = accumulator;
barrier(CLK_LOCAL_MEM_FENCE);
for(int offset = get_local_size(0) / 2;
offset > 0;
offset = offset / 2) {
if (local_index < offset) {
float other = scratch[local_index + offset];
float mine = scratch[local_index];
scratch[local_index] = mine + other;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (local_index == 0) {
result[get_group_id(0)] = scratch[0];
}
}
当我使用一个唯一的工作组(意思是我将NULL
作为local_work_size
添加到clEnqueueNDRangeKernel()
)时,它就像一个魅力,但当我尝试时,事情就失控了更改工作组维度。 (我应该说我是OpenCl的新手)
我的工作如下
#define GLOBAL_DIM 600
#define WORK_DIM 60
size_t global_1D[3] = {GLOBAL_DIM,1,1};
size_t work_dim[3] = {WORK_DIM,1,1};
err = clEnqueueNDRangeKernel(commands, av_velocity_kernel, 1, NULL, global_1D, work_dim, 0, NULL, NULL); //TODO CHECK THIS LINE
if (err) {
printf("Error: Failed to execute av_velocity_kernel!\n"); printf("\n%s",err_code(err)); fflush(stdout); return EXIT_FAILURE; }
我做错了吗?
此外,我注意到如果我设置#define GLOBAL_DIM 60000
(这是我需要的),我的本地内存耗尽。如果我使用多个工作组,或者本地内存在工作组之间均匀分布,我会获得“更多”本地内存吗?
答案 0 :(得分:0)
首先,如果工作组大小是2的幂,那些还原内核只能正常工作。这意味着你应该使用64而不是60.而且,更改GLOBAL_DIM无法让你耗尽本地内存:在调用内核时,你最有可能做错了。