虽然不重要,但我有三个大小相同的缓冲区和一个示例 kernel.cl(只是向量加法)。我省略了初始化和释放资源的步骤。
我执行了两次缓冲区写入(缓冲区 A 和 B)和一次内核执行(如下所示)。我定义了两个回调。
/*
 * Callback attached to the read-back of buffer C.
 *
 * Prints the status it was invoked with, the address and first element of
 * the host array, stores 1 into element 1 of that array, and finally marks
 * the user event referenced by the callback data as CL_COMPLETE.
 *
 * event     - event the callback was registered on (not used here)
 * ev_status - status value handed to the callback; only printed
 * user_data - pointer to the callback_data holding the host array and the
 *             user event to signal
 */
void
CL_CALLBACK callback_read_fn(cl_event event, cl_int ev_status, void* user_data)
{
  printf("callback read executed (%d)\n", ev_status);

  callback_data* data = (callback_data*)user_data;
  int* host_c = data->c_v;
  cl_event done = *(data->end);

  printf("c_v %p\n", (void*)host_c);
  printf("c_v[0] = %d\n", host_c[0]);
  host_c[1] = 1;

  /* Signalling the user event is the last thing this callback does. */
  cl_int status = clSetUserEventStatus(done, CL_COMPLETE);
  printf("set event callback (%d)\n", status);
}
/*
 * Callback attached to the kernel event: enqueues the read of buffer C
 * into the host array and arranges for the `end` user event to be
 * signalled.
 *
 * event     - event the callback was registered on (not used here)
 * ev_status - status value handed to the callback; only printed
 * user_data - pointer to a callback_data carrying the queue, the buffer,
 *             the host array, the element count, the mode flags and the
 *             `end` user event
 *
 * With nested_callbacks set, callback_read_fn is registered on the read
 * event and is the one that signals `end`; otherwise `end` is signalled
 * here, right after the read has been enqueued.
 *
 * If the read cannot be enqueued, or the nested callback cannot be
 * registered, `end` is set to the (negative) error code so that a thread
 * waiting on it is released instead of blocking forever.
 *
 * NOTE(review): hazards that cannot be removed inside this function
 * without changing what the flags mean:
 *  - nested_callbacks == false together with a non-blocking read signals
 *    `end` before the read is known to have finished, so whoever waits on
 *    `end` may still see stale c_v contents.
 *  - The OpenCL specification leaves blocking calls (such as a blocking
 *    clEnqueueReadBuffer) made from an event callback undefined, so
 *    blocking == CL_TRUE may hang on some implementations.
 *  - Nothing visible here releases ev_read.
 */
void
CL_CALLBACK callback_kernel_fn(cl_event event, cl_int ev_status, void* user_data)
{
  printf("callback kernel executed (%d)\n", ev_status);
  callback_data* cb_data = (callback_data*)user_data;
  cl_command_queue queue = *(cb_data->queue);
  cl_mem buf_c = *(cb_data->buf_c);
  int* c_v = cb_data->c_v;
  int size = cb_data->size;
  bool nested_callbacks = cb_data->nested_callbacks;
  bool blocking = cb_data->blocking;
  cl_event end = *(cb_data->end);
  cl_event ev_read;
  cl_int st;
  cl_int read_st;

  printf("c_v %p\n", (void*)c_v);
  printf("c_v[0] = %d\n", c_v[0]);

  printf("about to read the c buffer\n");
  /* Keep the enqueue status in its own variable so later calls cannot
     overwrite it before it is checked and reported. */
  read_st = clEnqueueReadBuffer(queue, buf_c, blocking, 0,
                                size * sizeof(int), c_v, 0, NULL, &ev_read);
  if (read_st < 0){
    /* OpenCL error codes are negative. ev_read was not produced, so it
       must not be used; propagate the failure through `end`. */
    printf("read buffer c_v - buf_c (%d)\n", read_st);
    st = clSetUserEventStatus(end, read_st);
    printf("set event callback (%d)\n", st);
    return;
  }

  if (nested_callbacks){
    st = clSetEventCallback(ev_read, CL_COMPLETE, callback_read_fn, user_data);
    printf("set event callback (%d)\n", st);
    if (st < 0){
      /* callback_read_fn will never run, so nobody else would signal
         `end`; release the waiter with the error code instead. */
      st = clSetUserEventStatus(end, st);
      printf("set event callback (%d)\n", st);
    }
  }else{
    st = clSetUserEventStatus(end, CL_COMPLETE);
    printf("set event callback (%d)\n", st);
    /* Report the status of the read itself, not of the call above. */
    printf("read buffer c_v - buf_c (%d)\n", read_st);
  }
}
int
main()
{
// initialization
// Execute kernel
st = clEnqueueNDRangeKernel(queue, kernel1, dims, NULL, (const size_t*)gws,
(const size_t*)lws, 0, NULL, &ev_kernel);
/* (const size_t*)lws, 0, NULL, NULL); */
printf("nd range kernel1 (%d %s)\n", st, clErrorString(st));
end = clCreateUserEvent(context, &st);
printf("create user event (%d)\n", st);
callback_data* user_data = (callback_data*)malloc(sizeof(callback_data));
printf("c_v %p\n", (void*)c_v);
user_data->queue = &queue;
user_data->buf_c = &buf_c;
user_data->c_v = c_v;
user_data->size = size;
user_data->end = &end;
user_data->nested_callbacks = use_nested_callbacks;
use_data->blocking = use_blocking;
if (use_callbacks){
st = clSetEventCallback(ev_kernel, CL_COMPLETE, callback_kernel_fn, user_data);
printf("set event callback (%d)\n", st);
}
/* printf("finish queue\n"); */
/* clFinish(queue); */
/* printf("finished queue\n"); */
if (use_callbacks){
printf("waiting for events\n");
/* /\* cl_event events[] = {ev_kernel}; *\/ */
cl_event events[] = {end};
clWaitForEvents(1, events); // ev_kernel);
printf("waited for events\n");
}else{
printf("about to read the c buffer\n");
st = clEnqueueReadBuffer(queue, buf_c, use_blocking, 0,
size * sizeof(int), c_v, 0, NULL, NULL);
printf("read buffer c_v - buf_c (%d)\n", st);
}
free(user_data);
// ...
#define THRESHOLD 0
// check
printf("about to check (first: %d)\n", c_v[0]);
for (size_t i=0; i<size; i++){
if (abs(c_v[i] - (a_v[i] + b_v[i])) > THRESHOLD){
printf("Wrong checking: a_v[%ld] = %d, b_v[%ld] = %d, c_v[%ld] = %d\n", i, a_v[i], i, b_v[i], i, c_v[i]);
exit(EXIT_FAILURE);
}
}
// ...
}
我的问题与竞争条件(race condition)和程序冻结(挂起)有关。
三个问题:
1. 当 use_callbacks = 1、use_blocking = CL_FALSE 且 use_nested_callbacks = 0 时,在检查 C 的内容时是否存在潜在的竞争?(也许当我们在 main 中检查 C 时,callback_kernel_fn 中的 clEnqueueReadBuffer 尚未完成。)另外,如果我使用 use_callbacks = 0 和 use_blocking = CL_FALSE,那么在检查 C 的内容时会有潜在的竞争吗?(所有这些都在同一个函数 main 中。)
2. clFinish 与 clWaitForEvents 之间的异步行为有何不同?使用 clFinish 时我需要提供 queue,但是我仍然可以在回调中修改队列以便能够继续执行吗?我是在考虑各种用例。
3. 为什么只有当 use_callbacks = 1,并且在 callback_kernel_fn 中以 use_blocking = CL_TRUE 调用 clEnqueueReadBuffer 时,main 才无法完成(程序冻结)?(因此,第一种情况使用 use_callbacks = 0,第二种情况使用 use_callbacks = 1。)使用例如 NVIDIA 或 AMD GPU 时不会发生这种情况。