Question

我有一个设置，我在多个设备之间共享一个内存对象。我试图多次调用内核来积累一些值。在CPU上，这工作正常。在GPU上，后续调用没有看到上一次调用的结果。这是带有调试语句的内核。

#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void mgrid_sum(__global double *ar,            
                        __global double *ap,           
                        __global double *az,            
                        __global const double *temp_ar,
                        __global const double *temp_ap,
                        __global const double *temp_az,
                        double current) {
    size_t i = get_global_id(0);
    if (i == 0) {
        printf("1 %6i %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e\n", i, ar[i], ap[i], az[i], temp_ar[i], temp_ap[i], temp_az[i], current);
    }
    ar[i] += temp_ar[i]*current;
    ap[i] += temp_ap[i]*current;
    az[i] += temp_az[i]*current;
    if (i == 0) {
        printf("2 %6i %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e\n", i, ar[i], ap[i], az[i], temp_ar[i], temp_ap[i], temp_az[i], current);
    }
}

我有两组记忆对象。我的temp_内存缓冲区在每次内核调用之前加载新值。由于每个设备都在一个独立的内存块上运行，我希望在完成所有内核调用之后，我不需要同步内存缓冲区。主持人代码是

// Define memory objects. The host only needs these values at the very end.
a_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);
a_p = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);
a_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);

// Define temp memory objects. This will only be written to by the host and read by the kernel.
cl_mem temp_ar = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);
cl_mem temp_ap = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);
cl_mem temp_az = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);

// The first six kernel arguments don't change.
clSetKernelArg(sum, 0, sizeof(cl_mem), &a_r);
clSetKernelArg(sum, 1, sizeof(cl_mem), &a_p);
clSetKernelArg(sum, 2, sizeof(cl_mem), &a_z);
clSetKernelArg(sum, 3, sizeof(cl_mem), &temp_ar);
clSetKernelArg(sum, 4, sizeof(cl_mem), &temp_ap);
clSetKernelArg(sum, 5, sizeof(cl_mem), &temp_az);

size_t totalsize = 0;
for (device_info *device : cpu_devices) {
    size_t worksize;
    clGetKernelWorkGroupInfo(sum, device->id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &worksize, NULL);
    device->max_work_group_size = worksize;
    totalsize += worksize;
}
for (device_info *device : gpu_devices) {
    size_t worksize;
    clGetKernelWorkGroupInfo(sum, device->id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &worksize, NULL);
    device->max_work_group_size = worksize;
    totalsize += worksize;
}

size_t n_chunks = array_size/totalsize;
size_t remainder = array_size%totalsize;

size_t offset = 0;
for (device_info *device : gpu_devices) {
    device->m_offset = offset;
    device->m_chunk = n_chunks*device->max_work_group_size;
    offset += device->m_chunk;
}
for (device_info *device : cpu_devices) {
    device->m_offset = offset;
    device->m_chunk = n_chunks*device->max_work_group_size;
    offset += device->m_chunk;
}
cpu_devices.back()->m_chunk += remainder;

cl_event event;
std::vector<cl_event> buffer_events;
std::vector<cl_event> unmap_events;

// Make sure the a_ memory starts with zero.
const cl_char pattern = 0;
for (device_info *device : gpu_devices) {
    offset = device->m_offset*sizeof(cl_double);
    size_t fill_size = device->m_chunk*sizeof(cl_double);
    clEnqueueFillBuffer(device->queue, a_r, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
    clEnqueueFillBuffer(device->queue, a_p, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
    clEnqueueFillBuffer(device->queue, a_z, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
}
for (device_info *device : cpu_devices) {
    offset = device->m_offset*sizeof(cl_double);
    size_t fill_size = device->m_chunk*sizeof(cl_double);
    clEnqueueFillBuffer(device->queue, a_r, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
    clEnqueueFillBuffer(device->queue, a_p, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
    clEnqueueFillBuffer(device->queue, a_z, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
    buffer_events.push_back(event);
}

// For each iteration load the value of current set the temp_ buffer value.
// Temp buffer values are obtained by mapping the memory object to the host 
// and writing to the memory object directly.
for (size_t i = 0, e = extcur.size(); i < e; i++) {
    clSetKernelArg(sum, 6, sizeof(cl_double), extcur.data() + i);

    cl_command_queue cpu_queue = cpu_devices.back()->queue;

    std::stringstream ss_ar;
    ss_ar << "ar_" << std::setfill('0') << std::setw(3) << i + 1;
    std::stringstream ss_ap;
    ss_ap << "ap_" << std::setfill('0') << std::setw(3) << i + 1;
    std::stringstream ss_az;
    ss_az << "az_" << std::setfill('0') << std::setw(3) << i + 1;

    cl_double *temp_buffer;
    nc_inq_varid(ncid, ss_ar.str().c_str(), &temp_varid);
    temp_buffer = static_cast<cl_double *>(clEnqueueMapBuffer(cpu_queue, temp_ar, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, static_cast<cl_uint>(last_events.size()), last_events.data(), NULL, NULL));
    nc_get_var(ncid, temp_varid, temp_buffer);
    clEnqueueUnmapMemObject(cpu_queue, temp_ar, temp_buffer, 0, NULL, &event);
    buffer_events.push_back(event);

    nc_inq_varid(ncid, ss_ap.str().c_str(), &temp_varid);
    temp_buffer = static_cast<cl_double *>(clEnqueueMapBuffer(cpu_queue, temp_ap, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, static_cast<cl_uint>(last_events.size()), last_events.data(), NULL, NULL));
    nc_get_var(ncid, temp_varid, temp_buffer);
    buffer_events.push_back(event);
    clEnqueueUnmapMemObject(cpu_queue, temp_ar, temp_buffer, 0, NULL, &event);
    buffer_events.push_back(event);

    nc_inq_varid(ncid, ss_az.str().c_str(), &temp_varid);
    temp_buffer = static_cast<cl_double *>(clEnqueueMapBuffer(cpu_queue, temp_az, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, static_cast<cl_uint>(last_events.size()), last_events.data(), NULL, NULL));
    nc_get_var(ncid, temp_varid, temp_buffer);
    buffer_events.push_back(event);
    clEnqueueUnmapMemObject(cpu_queue, temp_ar, temp_buffer, 0, NULL, &event);
    buffer_events.push_back(event);

    for (cl_event event : last_events) {
        clReleaseEvent(event);
    }
    last_events.clear();

    // Call kernels. These should wait until the memory objects are fully written.
    for (device_info *device : gpu_devices) {
        offset = device->m_offset;
        size_t fill_size = device->m_chunk;
        clEnqueueNDRangeKernel(device->queue, sum, 1, &offset, &fill_size, NULL, static_cast<cl_uint>(buffer_events.size()), buffer_events.data(), &event);
        last_events.push_back(event);
        offset += fill_size;
    }
    for (device_info *device : cpu_devices) {
        offset = device->m_offset;
        size_t fill_size = device->m_chunk;
        clEnqueueNDRangeKernel(device->queue, sum, 1, &offset, &fill_size, NULL, static_cast<cl_uint>(buffer_events.size()), buffer_events.data(), &event);
        last_events.push_back(event);
    }

    for (cl_event event : buffer_events) {
        clReleaseEvent(event);
    }
    buffer_events.clear();
}

我期待在第一次内核调用之后，它会一直停留在设备上，直到。但是，第一个GPU设备的输出显示前一个内核调用写入的值未被保留。我的调试语句的输出显示了这个

1      0  0.00000e+00  0.00000e+00  0.00000e+00  1.17641e-05  2.32558e-05 -3.04041e-05  4.20712e+03
2      0  4.94930e-02  9.78400e-02 -1.27913e-01  1.17641e-05  2.32558e-05 -3.04041e-05  4.20712e+03

# The first three values should be 4.94930e-02  9.78400e-02 -1.27913e-01 not zero.
1      0  0.00000e+00  0.00000e+00  0.00000e+00 -1.33115e-21  1.44679e-07  0.00000e+00  1.11479e+03
2      0 -1.48396e-18  1.61287e-04  0.00000e+00 -1.33115e-21  1.44679e-07  0.00000e+00  1.11479e+03

1      0  0.00000e+00  0.00000e+00  0.00000e+00  2.94903e-24 -3.56183e-06  0.00000e+00  1.46293e+04
2      0  4.31422e-20 -5.21071e-02  0.00000e+00  2.94903e-24 -3.56183e-06  0.00000e+00  1.46293e+04

# This call correctly retained the values of the previous call.
1      0  4.31422e-20 -5.21071e-02  0.00000e+00  6.39571e-22  1.25141e-05  0.00000e+00 -9.38901e-02
2      0  4.30821e-20 -5.21083e-02  0.00000e+00  6.39571e-22  1.25141e-05  0.00000e+00 -9.38901e-02

1      0  4.31422e-20 -5.21071e-02  0.00000e+00  8.62591e-22 -1.69898e-06  0.00000e+00  0.00000e+00
2      0  4.31422e-20 -5.21071e-02  0.00000e+00  8.62591e-22 -1.69898e-06  0.00000e+00  0.00000e+00

1      0  4.31422e-20 -5.21071e-02  0.00000e+00 -1.79097e-05 -3.44107e-22  4.79035e-05 -4.39945e+02
2      0  7.87928e-03 -5.21071e-02 -2.10749e-02 -1.79097e-05 -3.44107e-22  4.79035e-05 -4.39945e+02

1      0  4.31422e-20 -5.21071e-02  0.00000e+00 -3.54518e-24  1.54271e-07  0.00000e+00  7.54929e+03
2      0  1.63786e-20 -5.09424e-02  0.00000e+00 -3.54518e-24  1.54271e-07  0.00000e+00  7.54929e+03

1      0  1.63786e-20 -5.09424e-02  0.00000e+00  1.77797e-08 -3.33409e-08 -6.29109e-09  1.03658e+03
2      0  1.84302e-05 -5.09770e-02 -6.52124e-06  1.77797e-08 -3.33409e-08 -6.29109e-09  1.03658e+03

1      0  1.63786e-20 -5.09424e-02  0.00000e+00 -1.47516e-05  8.13733e-06  2.78991e-06  0.00000e+00
2      0  1.63786e-20 -5.09424e-02  0.00000e+00 -1.47516e-05  8.13733e-06  2.78991e-06  0.00000e+00

1      0  1.63786e-20 -5.09424e-02  0.00000e+00  5.12265e-12  1.55708e-07 -1.05426e-11  0.00000e+00
2      0  1.63786e-20 -5.09424e-02  0.00000e+00  5.12265e-12  1.55708e-07 -1.05426e-11  0.00000e+00

它是否在调用之间进行隐式同步？

更新

以下是有关设备的信息。

Platform Name   : Apple
Device Name     : Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
Device Type     : CPU
Supports Double : True
Address Bits    : 64
Max Work Size   : 1
Extensions      : cl_APPLE_SetMemObjectDestructor  
                  cl_APPLE_ContextLoggingFunctions 
                  cl_APPLE_clut 
                  cl_APPLE_query_kernel_names 
                  cl_APPLE_gl_sharing 
                  cl_khr_gl_event 
                  cl_khr_fp64             
                  cl_khr_global_int32_base_atomics 
                  cl_khr_global_int32_extended_atomics 
                  cl_khr_local_int32_base_atomics  
                  cl_khr_local_int32_extended_atomics 
                  cl_khr_byte_addressable_store 
                  cl_khr_int64_base_atomics 
                  cl_khr_int64_extended_atomics 
                  cl_khr_3d_image_writes 
                  cl_khr_image2d_from_buffer 
                  cl_APPLE_fp64_basic_ops 
                  cl_APPLE_fixed_alpha_channel_orders 
                  cl_APPLE_biased_fixed_point_image_formats 
                  cl_APPLE_command_queue_priority
m_offset        : 731392
m_chunk         : 3080

Platform Name   : Apple
Device Name     : AMD Radeon R9 M370X Compute Engine
Device Type     : GPU
Supports Double : True
Address Bits    : 32
Max Work Size   : 256
Extensions      : cl_APPLE_SetMemObjectDestructor 
                  cl_APPLE_ContextLoggingFunctions 
                  cl_APPLE_clut 
                  cl_APPLE_query_kernel_names 
                  cl_APPLE_gl_sharing 
                  cl_khr_gl_event 
                  cl_khr_global_int32_base_atomics 
                  cl_khr_global_int32_extended_atomics 
                  cl_khr_local_int32_base_atomics            
                  cl_khr_local_int32_extended_atomics 
                  cl_khr_byte_addressable_store 
                  cl_khr_image2d_from_buffer 
                  cl_khr_depth_images 
                  cl_APPLE_command_queue_priority 
                  cl_APPLE_command_queue_select_compute_units 
                  cl_khr_fp64
m_offset        : 0
m_chunk         : 731392

Answer 1

我弄清楚发生了什么。我需要向clFlush添加显式调用以在队列中使用事件。 clFlush州的文档：

要将引用命令队列中排队的命令的事件对象用作要在其他命令队列中排队的命令等待的事件对象，应用程序必须调用clFlush或执行隐式刷新的任何阻塞命令命令队列，其中引用这些事件对象的命令已入队。

关于cl_event的文档中并未明确这一点。

内核不保留对全局cl_mem对象的写入

1 个答案: