OpenCL回调挂起/冻结(死锁,pthread_cond_wait)

时间:2017-05-07 11:57:00

标签: events callback opencl

我创建了一个基本代码段:

内核:

__kernel void
kernel1(__global int* a, __global int* b, __global int* c, int size)
{

  int idx = get_global_id(0);

  if (idx >= 0 && idx < size){
    c[idx] = a[idx] + b[idx];
  }
}

代码:

#include <CL/cl.h>

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_FILE_SIZE 1024000

#include <sys/stat.h>
#include <sys/types.h>

typedef enum ocl_type_e_t {
  OCL_TYPE_NULL = 0,
  OCL_TYPE_CPU = 1,
  OCL_TYPE_GPU = 2,
  OCL_TYPE_IGPU = 3,
  OCL_TYPE_ACC = 4
} ocl_type_e_t;


const char*
cl_device_type_to_str(cl_device_type type)
{
  static char* strings[] = {
    "(invalid)", // invalid
    "CL_DEVICE_TYPE_CPU",
    "CL_DEVICE_TYPE_GPU",
    "CL_DEVICE_TYPE_ACCELERATOR",
    "CL_DEVICE_TYPE_CUSTOM",
    "CL_DEVICE_TYPE_DEFAULT",
    "CL_DEVICE_TYPE_ALL",
  };

  char* ret;

  switch (type) {
    case CL_DEVICE_TYPE_CPU:
      ret = strings[1];
      break;
    case CL_DEVICE_TYPE_GPU:
      ret = strings[2];
      break;
    case CL_DEVICE_TYPE_ACCELERATOR:
      ret = strings[3];
      break;
    case CL_DEVICE_TYPE_CUSTOM:
      ret = strings[4];
      break;
    case CL_DEVICE_TYPE_DEFAULT:
      ret = strings[5];
      break;
    case CL_DEVICE_TYPE_ALL:
      ret = strings[6];
      break;
    default:
      ret = strings[0];
      break;
  }
  return ret;
}

const char*
file_read(char* const path)
{
  struct stat st;
  /* st = (struct stat*)malloc(sizeof(stat)); */
  int error = stat(path, &st);
  if (error != 0) {
    printf("Invalid file %s\n", path);
    exit(EXIT_FAILURE);
  }

  int size_file = st.st_size;

  if (size_file > MAX_FILE_SIZE) {
    printf("File %s is bigger than the max allowed size (%d > %d bytes)\n",
           path, size_file, MAX_FILE_SIZE);
    exit(EXIT_FAILURE);
  }

  FILE* fp = fopen(path, "r");
  if (fp == NULL) {
    printf("Error opening the file %s\n", path);
    exit(EXIT_FAILURE);
  }

  char* const buf = (char* const)malloc(size_file);
  if (buf == NULL) {
    printf("Error allocating %d bytes for the contents of the file %s\n",
           size_file, path);
    exit(EXIT_FAILURE);
  }

  int size_read;
  while ((size_read = fread(buf, sizeof(char), size_file, fp)) > 0) {
    ;
  }

  fclose(fp);

  return buf;
}



cl_event clb_events_waiting[100];
int clb_events_waiting_device[100];
int clb_events_init_read[100];
int clb_num_events_waiting = 0;

void
clbWaitEvents(int * c)
{
  if (clb_num_events_waiting > 0){
    printf("About to wait events: %d\n", clb_num_events_waiting);
    int i;
    int waiting = 0;
    cl_event ev_waiting[100];
    printf("%d = CL_QUEUED, %d = CL_COMPLETE, %d = CL_SUBMITTED, %d = CL_RUNNING\n", CL_QUEUED, CL_COMPLETE, CL_SUBMITTED, CL_RUNNING);
    for (i=0; i<clb_num_events_waiting; i++){
      cl_int ret;
      clGetEventInfo(clb_events_waiting[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL);
      int dev = clb_events_waiting_device[i];
      int init = clb_events_init_read[i] / sizeof(int);
      printf("cl_event %s init %6d  [%d] = status %d (ref %p)\n", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, i, ret, (void*)clb_events_waiting[i]);

      if (ret != CL_COMPLETE){
        ev_waiting[waiting] = clb_events_waiting[i];
        waiting++;
      }
    }

    for (i=0; i<clb_num_events_waiting; i++){
      int dev = clb_events_waiting_device[i];
      int init = clb_events_init_read[i] / sizeof(int);
      printf("%s [%d] = %d, [%d] = %d, [%d] = %d\n", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, c[init], init + 1, c[init + 1], init + 2, c[init + 2]);
    }

    if (waiting > 0){
      printf("about to wait %d events\n", waiting);
      clWaitForEvents(waiting, ev_waiting);
      printf("wait events finished\n");
    }
    /* clWaitForEvents(clb_num_events_waiting, clb_events_waiting); */
  }
}














typedef struct callback_data
{
  cl_command_queue* queue;
  cl_mem* buf_c;
  int* c_v;
  uint size;
  cl_event* end;
  bool nested_callbacks;
  bool blocking;
} callback_data;

void CL_CALLBACK callback_read_fn(cl_event event, cl_int ev_status,
                                  void* user_data);

void CL_CALLBACK callback_kernel_fn(cl_event event, cl_int ev_status,
                                    void* user_data);

int
main(int argc, char* argv[])
{

  bool use_callbacks = true;
  bool use_nested_callbacks = true;
  bool use_blocking = false;

  int numSelPlatform = 0;
  int numSelDevice = 0;
  int doUseCallbacks = 0;
  int doUseNestedCallbacks = 0;
  int doUseBlocking = 0;
  int use_type = 0;
  if (argc != 7) {
    printf("./%s (platform) (device) (type cpu 0|gpu 1|igpu 2|acc 3) (use "
           "callbacks) (use nested callbacks) (use blocking)\n",
           argv[0]);
    exit(EXIT_FAILURE);
  } else {
    numSelPlatform = atoi(argv[1]);
    numSelDevice = atoi(argv[2]);
    use_type = atoi(argv[3]);
    doUseCallbacks = atoi(argv[4]);
    doUseNestedCallbacks = atoi(argv[5]);
    doUseBlocking = atoi(argv[6]);
  }

  cl_event end;

  uint size = 1024;
  int* a_v = (int*)malloc(size * sizeof(int));
  int* b_v = (int*)malloc(size * sizeof(int));
  int* c_v = (int*)malloc(size * sizeof(int));
  for (size_t i = 0; i < size; i++) {
    a_v[i] = i;
    b_v[i] = i + 1;
    c_v[i] = 0;
  }

  const char* kernel_str = file_read("src/kernel.cl");

  use_callbacks = doUseCallbacks;
  use_nested_callbacks = doUseNestedCallbacks;
  use_blocking = doUseBlocking ? CL_TRUE : CL_FALSE;

  cl_int st;
  cl_int err;

  int len = 256;
  char buflog[len];

  cl_uint numPlatforms = 0;
  st = clGetPlatformIDs(0, NULL, &numPlatforms);
  cl_platform_id* platforms = NULL;
  platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));

  st = clGetPlatformIDs(numPlatforms, platforms, NULL);
  printf("platforms: %d (%d)\n", numPlatforms, st);

  cl_uint selPlatform = numSelPlatform; // 1;

  numPlatforms = 1;
  cl_platform_id platform = platforms[selPlatform];

  clGetPlatformInfo(platform, CL_PLATFORM_NAME, len, &buflog, NULL);
  if (buflog != NULL) {
    printf("platform name: %s\n", buflog);
  }

  cl_uint numDevices = 0;
  st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
  printf("num devices: %d (%d)\n", numDevices, st);
  if (st != CL_SUCCESS) {
    /* printf("explain error: %s\n", clErrorString(st)); */
    printf("error: %d\n", st);
  }
  cl_device_id* devices = NULL;
  devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));

  st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
  printf("devices: %d (%d)\n", numDevices, st);

  // Context
  cl_context context;
  context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &err);
  printf("context (%d)\n", err);

  // Select device
  cl_uint selDevice = numSelDevice; // 0;
  numDevices = 1;                   // clBuildProgram
  cl_device_id device = devices[selDevice];

  // Device Info
  clGetDeviceInfo(device, CL_DEVICE_NAME, len, &buflog, NULL);
  if (buflog != NULL) {
    printf("device name: %s\n", buflog);
  }

  cl_device_type type;
  clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
  printf("device type: %s\n", cl_device_type_to_str(type));

  // events
  cl_event ev_kernel;

  // CommandQueue
  /* cl_command_queue_properties props; */
  cl_command_queue queue;
  queue = clCreateCommandQueue(context, device, 0, &err);
  printf("command queue (%d)\n", err);

  // CreateBuffer
  cl_mem buf_a;
  cl_mem buf_b;
  cl_mem buf_c;

  ocl_type_e_t ocl_type;
  if (use_type == 0) {
    ocl_type = OCL_TYPE_CPU;
    printf("mode CPU\n");
  } else if (use_type == 1) {
    ocl_type = OCL_TYPE_GPU;
    printf("mode GPU\n");
  } else if (use_type == 2) {
    ocl_type = OCL_TYPE_IGPU;
    printf("mode IGPU\n");
  } else if (use_type == 3) {
    ocl_type = OCL_TYPE_ACC;
    printf("mode ACC\n");
  }

  /* cl_mem buf_x; */
  switch (ocl_type) {
    case OCL_TYPE_IGPU:
      buf_a = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
                             a_v, &err);
      /* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
       * CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
      /*                      Acpy, &err); */
      break;
    case OCL_TYPE_GPU:
      buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
                             a_v, &err);
      break;
    case OCL_TYPE_ACC:
      buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             size * sizeof(int), a_v, &err);
      break;
    case OCL_TYPE_CPU:
      buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             size * sizeof(int), a_v, &err);
      break;
    default:
      printf("no ocl_type defined\n");
      exit(EXIT_FAILURE);
      break;
  }

  printf("create buffer a (%d)\n", err);
  if (err != CL_SUCCESS) {
    /* printf("create buffer error: %s\n", clErrorString(err)); */
    printf("create buffer error: %d\n", err);
  }

  switch (ocl_type) {
    case OCL_TYPE_IGPU:
      buf_b = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
                             b_v, &err);
      break;
    case OCL_TYPE_GPU:
      buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
                             b_v, &err);
      break;
    case OCL_TYPE_ACC:
      buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             size * sizeof(int), b_v, &err);
      break;
    case OCL_TYPE_CPU:
      buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             size * sizeof(int), b_v, &err);
      break;
    default:
      printf("no ocl_type defined\n");
      exit(EXIT_FAILURE);
      break;
  }

  printf("create buffer b (%d)\n", err);
  if (err != CL_SUCCESS) {
    printf("create buffer error: %d\n", err);
    /* printf("create buffer error: %s\n", clErrorString(err)); */
  }

  switch (ocl_type) {
    case OCL_TYPE_IGPU:
      buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
                             c_v, &err);
      /* buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, c_rows * c_cols *
       * sizeof(int), */
      /*                        c_v, &err); */
      /* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
       * CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
      /*                      Acpy, &err); */
      break;
    case OCL_TYPE_GPU:
      buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
                             c_v, &err);
      break;
    case OCL_TYPE_ACC:
      buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             size * sizeof(int), c_v, &err);
      break;
    case OCL_TYPE_CPU:
      buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE |
                             CL_MEM_USE_HOST_PTR,
      /* buf_c = */
        /* clCreateBuffer(context, CL_MEM_USE_HOST_PTR, */
                       /* buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, */
                       size * sizeof(int), c_v, &err);
      break;
    default:
      printf("no ocl_type defined\n");
      exit(EXIT_FAILURE);
      break;
  }

  printf("create buffer c (%d)\n", err);
  if (err != CL_SUCCESS) {
    /* printf("create buffer error: %s\n", clErrorString(err)); */
    printf("create buffer error: %d\n", err);
  }
  /* b_x = clCreateBuffer(context, CL_MEM_WRITE_ONLY, n * sizeof(float), x,
   * &err); */
  /* printf("create buffer x (%d)\n", err); */

  // WriteBuffer
  /* st = clEnqueueWriteBuffer(queue, b_a, CL_FALSE, 0, n * n * sizeof(float),
   */
  /*                           Acpy, 0, NULL, NULL); */
  /* printf("write buffer Acpy - b_a (%d)\n", st); */
  /* st = clEnqueueWriteBuffer(queue, b_b, CL_FALSE, 0, n * sizeof(float), bcpy,
   * 0, */
  /*                           NULL, NULL); */
  /* printf("write buffer bcpy - b_b (%d)\n", st); */

  // Create Program
  cl_program program;
  program = clCreateProgramWithSource(context, 1, (const char**)&kernel_str,
                                      NULL, &err);
  printf("create program (%d)\n", err);

  // Build Program
  /* st = clBuildProgram(program, numDevices, (cl_device_id*)&device, NULL,
   * NULL, */
  /*                     NULL); */
  char* opts = "-Werror";
  st = clBuildProgram(program, numDevices, (cl_device_id*)&device, opts, NULL,
                      NULL);
  printf("build program (%d)\n", st);
  if (st != CL_SUCCESS) {
    /* printf("build status: %s\n", clErrorString(st)); */
    printf("build status: %d\n", st);
    char log[512];
    st = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 512, &log,
                               NULL);
    printf("build info (%d)\n", st);
    if (st == CL_SUCCESS) {
      printf("%s\n", log);
    }
  }

  // Create Kernel
  cl_kernel kernel1;
  kernel1 = clCreateKernel(program, "kernel1", &st);
  printf("create kernel1 (%d)\n", st);
  /* cl_kernel kernel2; */
  /* kernel2 = clCreateKernel(program, "ocl1_2", &st); */
  /* printf("create kernel2 (%d)\n", st); */

  // workgroup size
  size_t dims = 1;
  size_t gws[] = { 1, 1, 1 };
  /* size_t gws[dims]; */
  gws[0] = size; // a_rows;
  /* gws[0] = 32; */
  /* size_t* lws = NULL; */
  /* size_t lws[dims]; */
  /* size_t lws[dims]; */
  /* size_t lws[dims] = NULL; */
  /* size_t lws[] = {0, 0, 0}; */
  size_t lws[] = { 128, 1, 1 };
  printf("gws {%lu, %lu, %lu}\n", gws[0], gws[1], gws[2]);
  if (lws != NULL) {
    printf("lws {%lu, %lu, %lu}\n", lws[0], lws[1], lws[2]);
  } else {
    printf("lws unspecified\n");
  }

  // Set Kernel Args
  st = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_a);
  printf("set arg %d (%d)\n", 0, st);
  st = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &buf_b);
  printf("set arg %d (%d)\n", 1, st);
  /* printf("set kernel1 arg: %d (%d)\n", 0, st); */
  st = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &buf_c);
  printf("set arg %d (%d)\n", 2, st);
  st = clSetKernelArg(kernel1, 3, sizeof(int), (int*)&size);
  printf("set arg %d (%d)\n", 3, st);

  // Execute kernel
  st = clEnqueueNDRangeKernel(queue, kernel1, dims, NULL, (const size_t*)gws,
                              (const size_t*)lws, 0, NULL, &ev_kernel);
  /* (const size_t*)lws, 0, NULL, NULL); */
  /* printf("nd range kernel1 (%d %s)\n", st, clErrorString(st)); */
  printf("nd range kernel1 (%d)\n", st);

  end = clCreateUserEvent(context, &st);
  printf("create user event (%d)\n", st);

  callback_data* user_data = (callback_data*)malloc(sizeof(callback_data));

  printf("c_v %p\n", (void*)c_v);

  user_data->queue = &queue;
  user_data->buf_c = &buf_c;
  user_data->c_v = c_v;
  user_data->size = size;
  user_data->end = &end;
  user_data->nested_callbacks = use_nested_callbacks;
  user_data->blocking = use_blocking;

  if (use_callbacks) {
    st =
      clSetEventCallback(ev_kernel, CL_COMPLETE, callback_kernel_fn, user_data);
    printf("set event callback (%d)\n", st);
  }
  /* printf("first: %2.5f\n", c_v[0]); */
  /* print_matrix_float_s_t("c", c); */
  // ReadBuffer
  /* float* ptr = (float*)clEnqueueMapBuffer(queue, buf_c, CL_TRUE, CL_MAP_READ,
   * 0, c_rows * c_cols * sizeof(float), 0, NULL, NULL, &st); */
  /* printf("read buffer c_v - buf_c (%d)\n", st); */
  /* printf("finish queue\n"); */
  /* clFinish(queue); */
  /* printf("finished queue\n"); */

  if (use_callbacks) {
    /* clWaitForCompletion(context); */

    printf("waiting for events\n");
    /* /\* cl_event events[] = {ev_kernel}; *\/ */
    cl_event events[] = { end };
    clWaitForEvents(1, events); // ev_kernel);
    printf("waited for events\n");

    clbWaitEvents(c_v);

  } else {
    printf("about to read the c buffer\n");
    st = clEnqueueReadBuffer(queue, buf_c, use_blocking, 0, size * sizeof(int),
                             c_v, 0, NULL, NULL);
    printf("read buffer c_v - buf_c (%d)\n", st);
  }

  /* print_matrix("c_v", c_v, c_rows, c_cols); */

  /* printf("first: %2.5f\n", c_v[0]); */
  /* print_matrix_float_s_t("c", c); */
  free(user_data);

  clReleaseKernel(kernel1);
  /* clReleaseKernel(kernel2); */
  clReleaseProgram(program);
  clReleaseCommandQueue(queue);
  clReleaseMemObject(buf_a);
  clReleaseMemObject(buf_b);
  clReleaseMemObject(buf_c);
  /* clReleaseMemObject(b_x); */
  clReleaseContext(context);
  free(devices);
  free(platforms);

#define THRESHOLD 0
  // check
  printf("about to check (first: %d)\n", c_v[0]);
  for (size_t i = 0; i < size; i++) {
    if (abs(c_v[i] - (a_v[i] + b_v[i])) > THRESHOLD) {
      printf("Wrong checking: a_v[%ld] = %d, b_v[%ld] = %d, c_v[%ld] = %d\n", i,
             a_v[i], i, b_v[i], i, c_v[i]);
      exit(EXIT_FAILURE);
    }
  }

  return EXIT_SUCCESS;
}

void CL_CALLBACK
callback_read_fn(cl_event event, cl_int ev_status, void* user_data)
{
  printf("-- BEGIN callback read executed (%d)\n", ev_status);
  callback_data* cb_data = (callback_data*)user_data;
  /* cl_command_queue queue = *(cb_data->queue); */
  /* cl_mem buf_c = *(cb_data->buf_c); */
  int* c_v = cb_data->c_v;
  cl_event end = *(cb_data->end);
  /* int size = cb_data->size; */

  cl_int st;

  printf("c_v %p\n", (void*)c_v);
  printf("c_v[0] = %d\n", c_v[0]);

  /* c_v[1] = 1; */

  st = clSetUserEventStatus(end, CL_COMPLETE);
  printf("set user event status (%d)\n", st);
  // haz que salga el finish
  printf("-- END\n");
}

cl_event ev_read;

void CL_CALLBACK
callback_kernel_fn(cl_event event, cl_int ev_status, void* user_data)
{
  printf("-- BEGIN callback kernel executed (%d)\n", ev_status);
  callback_data* cb_data = (callback_data*)user_data;
  cl_command_queue queue = *(cb_data->queue);
  cl_mem buf_c = *(cb_data->buf_c);
  int* c_v = cb_data->c_v;
  int size = cb_data->size;
  bool nested_callbacks = cb_data->nested_callbacks;
  bool blocking = cb_data->blocking;
  cl_event end = *(cb_data->end);


  printf("c_v %p\n", (void*)c_v);
  printf("c_v[0] = %d\n", c_v[0]);

  cl_int st;

  /* printf("about to flush\n"); */
  /* clFlush(queue); */
  /* printf("flushed\n"); */

  size_t offset = 0;
  /* size = size + 4; */
  printf("about to read the c buffer\n");
  printf("blocking %d\n", blocking);

  clb_events_waiting_device[clb_num_events_waiting] = 0;
  clb_events_init_read[clb_num_events_waiting] = 0;


  /* why it does not work? (blocking CL_TRUE) */
  st = clEnqueueReadBuffer(queue, buf_c, blocking, offset, size * sizeof(int),
                           c_v, 0, NULL, &clb_events_waiting[clb_num_events_waiting++]);
  ev_read = clb_events_waiting[clb_num_events_waiting - 1];
  printf("enqueue read buffer (%d)\n", st);
  /* size * sizeof(int), c_v, 0, NULL, NULL); */

  if (nested_callbacks) {
    st = clSetEventCallback(ev_read, CL_COMPLETE, callback_read_fn, user_data);
    printf("set event callback (%d)\n", st);
    /* st = clSetUserEventStatus(end, CL_COMPLETE); */
    /* printf("set user event status (%d)\n", st); */
  }
  /* c_v[1] = 1; */

  /* st = clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, ); */
  /* printf("event info (%d)\n", st); */

  /* int len = 512; */
  /* char buflog[len]; */
  /* cl_command_type; */
  /* clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, len, &buflog, NULL); */
  /* if (buflog != NULL) { */
  /*   printf("- event: %s\n", buflog); */
  /* } */

  if (!nested_callbacks) {
    st = clSetUserEventStatus(end, CL_COMPLETE);
    printf("set user event status (%d)\n", st);

    /* printf("read buffer c_v - buf_c (%d)\n", st); */
  }
  printf("-- END\n");
}

现在,如果我选择Intel CPU作为设备:

./callback 0 1 0 1 1 0

有效:

platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
context (0)
device name: Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz
device type: CL_DEVICE_TYPE_CPU
command queue (0)
mode CPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x1420030
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x1420030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
-- BEGIN callback read executed (0)
c_v 0x1420030
c_v[0] = 1
set user event status (0)
-- END
waited for events
About to wait events: 1
3 = CL_QUEUED, 0 = CL_COMPLETE, 2 = CL_SUBMITTED, 1 = CL_RUNNING
cl_event CPU init      0  [0] = status 0 (ref 0x7f7568000a90)
CPU [0] = 1, [1] = 3, [2] = 5
about to check (first: 1)

现在,如果我选择英特尔IGPU(英特尔集成GPU):

./callback 0 0 2 1 1 0

冻结/挂起:

platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
context (0)
device name: Intel(R) HD Graphics
device type: CL_DEVICE_TYPE_GPU
command queue (0)
mode IGPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x18b7030
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x18b7030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END

如果我使用gdb并运行相同的测试,并执行C-c,我可以看到:

(gdb) r 0 0 2 1 1 0
Starting program: /callbacks/build/callback 0 0 2 1 1 0
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
[New Thread 0x7ffff4cd9700 (LWP 21291)]
platforms: 1 (0)
platform name: Intel(R) OpenCL
num devices: 2 (0)
devices: 2 (0)
[New Thread 0x7fffeede2700 (LWP 21292)]
[New Thread 0x7fffee5e0700 (LWP 21293)]
[New Thread 0x7fffee9e1700 (LWP 21294)]
context (0)
device name: Intel(R) HD Graphics
device type: CL_DEVICE_TYPE_GPU
command queue (0)
mode IGPU
create buffer a (0)
create buffer b (0)
create buffer c (0)
create program (0)
build program (0)
create kernel1 (0)
gws {1024, 1, 1}
lws {128, 1, 1}
set arg 0 (0)
set arg 1 (0)
set arg 2 (0)
set arg 3 (0)
nd range kernel1 (0)
create user event (0)
c_v 0x607030
[New Thread 0x7fffec827700 (LWP 21295)]
set event callback (0)
waiting for events
-- BEGIN callback kernel executed (0)
c_v 0x607030
c_v[0] = 0
about to read the c buffer
blocking 0
enqueue read buffer (0)
set event callback (0)
-- END
^C
Thread 1 "callback" received signal SIGINT, Interrupt.
0x00007ffff730a756 in pthread_cond_wait@@GLIBC_2.3.2 () from /usr/lib/libpthread.so.0
(gdb) bt
#0  0x00007ffff730a756 in pthread_cond_wait@@GLIBC_2.3.2 () from /usr/lib/libpthread.so.0
#1  0x00007ffff64c635b in ?? () from /opt/intel/opencl/libintelocl.so
#2  0x00007ffff648c63a in ?? () from /opt/intel/opencl/libintelocl.so
#3  0x00007ffff647b5d1 in ?? () from /opt/intel/opencl/libintelocl.so
#4  0x00007ffff63f3e75 in clWaitForEvents () from /opt/intel/opencl/libintelocl.so
#5  0x00007ffff6edca43 in ?? () from /opt/intel/opencl/libIntelOpenCL.so
#6  0x000000000040237e in main (argc=7, argv=0x7fffffffdc58) at ./src/callback.c:532

正如您在第一个执行示例(CPU)中所看到的,它应该出现两个回调(两个BEGIN / END对)。在HD Graphics GPU的情况下,它在第一次回调后挂起(只有一个BEGIN / END对)。

为什么吗

(gdb显示在intel opencl驱动程序的pthread_cond_wait中冻结了。

任何人都可以解释回调/事件和主机线程的行为吗? (最佳实践,如何避免死锁)

我需要精细的控制和最快的性能,它看起来像是回调,但它们有奇怪的行为......

预期行为(仅发生在CPU中,而不发生在IGPU中): 1.主机创建用户事件。然后,主机调用EnqueueKernelNDRange(向量添加)并等待用户事件(WaitForEvents)。当内核完成时,它会触发回调“callback_kernel”。 2.这个“callback_kernel”调用EnqueueReadBuffer非阻塞,当它完成时触发回调“callback_read”。 3.“callback_read”设置CL_COMPLETE用户事件。 4.主机在WaitForEvents之后继续填充内容(缓冲区读取)。

1 个答案:

答案 0 :(得分:0)

您的问题如下:

/* why it does not work? (blocking CL_TRUE) */
st = clEnqueueReadBuffer(queue, buf_c, blocking, offset, size * sizeof(int),c_v, 0, NULL, &clb_events_waiting[clb_num_events_waiting++]);

在回调函数中,您尝试发出对clEnqueueReadBuffer的阻塞调用,这在OpenCL中是不允许的。您应该从以下链接检查规范说明中不允许的功能。

https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clSetEventCallback.html

我还建议您阅读驱动程序支持的规范中的整个回调部分,我在这里添加了最新的OpenCL规范2.2的相应部分。

https://www.khronos.org/registry/OpenCL/specs/opencl-2.2.pdf#page=197