我试图在同一上下文中同时在两个GPU设备上运行相同的内核。
我遇到了一个问题:在查询第二个命令队列上事件对象的剖析(profiling)信息时,返回错误码 -7(CL_PROFILING_INFO_NOT_AVAILABLE,剖析信息不可用)。
当我等待事件时,它将出现-7错误。这似乎只发生在命令队列2中。
知道为什么吗?任何帮助将不胜感激。
附加代码。
void *bytes;
float *zeropad;
float *output_f;
void *outputbytes;
int ret;
ret = posix_memalign(&bytes, total_alignment_requirement, cshape[level][1]*(size+2)*(size+2)*sizeof(float));
zeropad = (float *)bytes;
//float *output_f = (float *)calloc(cshape[level][0]*size*size,sizeof(float));
//SR assigning aligned memory
ret = posix_memalign(&outputbytes, total_alignment_requirement, cshape[level][1]*(size+2)*(size+2)*sizeof(float));
output_f = (float *)outputbytes;
unsigned int total=0;
//prepare matrix for OpenCL
padding_input(matrix,zeropad,size,in_depth);
cl::Buffer zeropad_buf(openclObjects.context,CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR,(size+2)*(size+2)*cshape[level][1]*sizeof(float),zeropad);
cl::Buffer output_buf(openclObjects.context,CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR ,cshape[level][0]*size*size*sizeof(float),output_f);
cl::Buffer bs(openclObjects.context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,cshape[level][0]*sizeof(float),bc[level]);
// SR using sub buffers only zeropad_buf and output_bf and to chunk up the buffer and submit the kernels twice...once to each device
//Creating sub_buffers for zeropad_buf
size_t zeropad_buf_size = (size+2)*(size+2)*cshape[level][1]*sizeof(float);
size_t output_buf_size = cshape[level][0]*size*size*sizeof(float);
size_t zeropad_split_pos = zeropad_buf_size / 2;
zeropad_split_pos -= zeropad_split_pos % total_alignment_requirement;
cl_buffer_region zero_rgn_4core = {0, zeropad_split_pos};
cl_buffer_region zero_rgn_2core = {zeropad_split_pos, zeropad_buf_size - zeropad_split_pos};
/*
cl_buffer_region zero_rgn_4core = {0, zeropad_buf_size/2};
cl_buffer_region zero_rgn_2core = {zeropad_buf_size/2, zeropad_buf_size/2};
*/
cl_buffer_region output_rgn_4core = {0, output_buf_size/2};
cl_buffer_region output_rgn_2core = {output_buf_size/2, output_buf_size/2};
cl::Buffer zeropad_buf_4Core = zeropad_buf.createSubBuffer(CL_MEM_READ_ONLY,CL_BUFFER_CREATE_TYPE_REGION, &zero_rgn_4core);
std::cout<<"zero_pad sub-buffer region 1 created"<<std::endl;
cl::Buffer zeropad_buf_2Core = zeropad_buf.createSubBuffer(CL_MEM_READ_ONLY,CL_BUFFER_CREATE_TYPE_REGION, &zero_rgn_2core);
std::cout<<"zero_pad sub-buffer region 2 created"<<std::endl;
cl::Buffer output_buf_4Core = output_buf.createSubBuffer(CL_MEM_READ_WRITE,CL_BUFFER_CREATE_TYPE_REGION, &output_rgn_4core);
cl::Buffer output_buf_2Core = output_buf.createSubBuffer(CL_MEM_READ_WRITE,CL_BUFFER_CREATE_TYPE_REGION, &output_rgn_2core);
cl::NDRange global(global_x, global_y, global_y);
cl::NDRange local(1, group_size, group_size);
//cl::Event evt[2];//SR
//SR use a vector events
std::vector<cl::Event> events;
cl::Event evt1, evt2;
//SR Kernel after sub buffering - 4 core
openclObjects.conv_gpu.setArg<cl::Memory>(0, zeropad_buf_4Core);
openclObjects.conv_gpu.setArg<cl::Memory>(1, conv_weights[level]);
openclObjects.conv_gpu.setArg<cl::Memory>(2, output_buf_4Core);
openclObjects.conv_gpu.setArg<cl::Memory>(3, bs);
openclObjects.conv_gpu.setArg<int>(4, size+2);
openclObjects.conv_gpu.setArg<int>(5, cshape[level][1]);
openclObjects.conv_gpu.setArg<int>(6, size);
openclObjects.conv_gpu.setArg<int>(7, cshape[level][0]);
openclObjects.conv_gpu.setArg<int>(8, CONV_SIZE);
cl_int err=openclObjects.queue[0].enqueueNDRangeKernel( openclObjects.conv_gpu, cl::NullRange, global, local, NULL, &evt1); //SR
events.push_back(evt1);
// cl_int err = openclObjects.queue.enqueueNDRangeKernel(openclObjects.conv_gpu,cl :: NullRange,全局,局部,NULL)
//SR Kernel after sub buffering - 2 core
openclObjects.conv_gpu.setArg<cl::Memory>(0, zeropad_buf_2Core);
openclObjects.conv_gpu.setArg<cl::Memory>(1, conv_weights[level]);
openclObjects.conv_gpu.setArg<cl::Memory>(2, output_buf_2Core);
openclObjects.conv_gpu.setArg<cl::Memory>(3, bs);
openclObjects.conv_gpu.setArg<int>(4, size+2);
openclObjects.conv_gpu.setArg<int>(5, cshape[level][1]);
openclObjects.conv_gpu.setArg<int>(6, size);
openclObjects.conv_gpu.setArg<int>(7, cshape[level][0]);
openclObjects.conv_gpu.setArg<int>(8, CONV_SIZE);
//SR Added for CQ2 (2 Core GPU)
err=openclObjects.queue[1].enqueueNDRangeKernel( openclObjects.conv_gpu, cl::NullRange, global, local, NULL, &evt2);
events.push_back(evt2);
std::cout<<"Enqueue CQ2"<<std::endl;
//get event info
cl::CommandQueue CQ;
cl::Device CQ_device;
evt2.getInfo(CL_EVENT_COMMAND_QUEUE,&CQ);
CQ.getInfo(CL_QUEUE_DEVICE, &CQ_device);
std::cout<<"New Code"<<std::endl;
std::cout<<"Event attached to COmmand Q2"<<std::endl;
std::cout<<"Device Name in Command Queue 1: "<<CQ_device.getInfo<CL_DEVICE_NAME>()<<std::endl;
std::cout<<"Device Vendor: "<<CQ_device.getInfo<CL_DEVICE_VENDOR>()<<std::endl;
std::cout<<"Device max CU: "<<CQ_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()<<std::endl;
cl::Event::waitForEvents(events);
//openclObjects.queue[0].finish(); //SR
std::cout<<"Command Queue 1 complete"<<std::endl;
//openclObjects.queue[1].finish();//SR added for CQ2
std::cout<<"Command Queue 2 complete"<<std::endl;
// printf(“ global_x,global_y,global_y,错误:%d%d%d%d \ n”,global_x,global_y,global_y,err); // printf(“%d \ n”,err);
cl_ulong elapsed=0;
cl_ulong elapsed1=0; //SR calculate elapse per command queue
cl_ulong elapsed0=0; //SR calculate elapse per command queue
elapsed0 =evt1.getProfilingInfo<CL_PROFILING_COMMAND_END>()-evt1.getProfilingInfo<CL_PROFILING_COMMAND_START>(); //SR
std::cout<<"Profile Info: Command Queue 1"<<std::endl;
elapsed1 =evt2.getProfilingInfo<CL_PROFILING_COMMAND_END>()-evt2.getProfilingInfo<CL_PROFILING_COMMAND_START>(); //SR
std::cout<<"Profile Info: Command Queue 2"<<std::endl;
//std::cout<<"elapsed CQ0"<<elapsed0<<std::endl; //SR
//std::cout<<"elapsed CQ1"<<elapsed1<<std::endl; //SR
elapsed = elapsed0+elapsed1;
答案(得分:0):
尝试取消注释 openclObjects.queue[0].finish(); 和 openclObjects.queue[1].finish();
您也可以使用 flush() 代替 finish()。