我有一个设置,我在多个设备之间共享一个内存对象。我试图多次调用内核来积累一些值。在CPU上,这工作正常。在GPU上,后续调用没有看到上一次调用的结果。这是带有调试语句的内核。
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

// Accumulate one weighted contribution into the running sums:
//   ar[i] += temp_ar[i]*current  (likewise for ap/az).
// One work-item per array element. The prints guarded by i == 0 are
// debug output only and do not affect the result.
__kernel void mgrid_sum(__global double *ar,
                        __global double *ap,
                        __global double *az,
                        __global const double *temp_ar,
                        __global const double *temp_ap,
                        __global const double *temp_az,
                        double current) {
    size_t i = get_global_id(0);
    if (i == 0) {
        // Cast the size_t id to int to match %6i. The original passed a
        // size_t for an int conversion, which is a format/argument type
        // mismatch (garbage or shifted output on 64-bit implementations).
        printf("1 %6i %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e\n", (int)i, ar[i], ap[i], az[i], temp_ar[i], temp_ap[i], temp_az[i], current);
    }
    ar[i] += temp_ar[i]*current;
    ap[i] += temp_ap[i]*current;
    az[i] += temp_az[i]*current;
    if (i == 0) {
        printf("2 %6i %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e %12.5e\n", (int)i, ar[i], ap[i], az[i], temp_ar[i], temp_ap[i], temp_az[i], current);
    }
}
我有两组内存对象。我的 temp_
内存缓冲区在每次内核调用之前加载新值。由于每个设备都在一个独立的内存块上运行,我预期在所有内核调用完成之前都不需要同步内存缓冲区。主机代码如下
// Define memory objects. The host only needs these values at the very end.
// All three accumulator buffers live in the shared context, so every device
// in the context operates on (its slice of) the same buffer.
// NOTE(review): the errcode_ret argument (last NULL) is discarded on every
// clCreateBuffer call — allocation failures would go unnoticed.
a_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);
a_p = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);
a_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, buffer_size, NULL, NULL);
// Define temp memory objects. This will only be written to by the host and read by the kernel.
cl_mem temp_ar = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);
cl_mem temp_ap = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);
cl_mem temp_az = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, buffer_size, NULL, NULL);
// The first six kernel arguments don't change.
// (Argument 6, `current`, is rebound inside the per-iteration loop below.)
clSetKernelArg(sum, 0, sizeof(cl_mem), &a_r);
clSetKernelArg(sum, 1, sizeof(cl_mem), &a_p);
clSetKernelArg(sum, 2, sizeof(cl_mem), &a_z);
clSetKernelArg(sum, 3, sizeof(cl_mem), &temp_ar);
clSetKernelArg(sum, 4, sizeof(cl_mem), &temp_ap);
clSetKernelArg(sum, 5, sizeof(cl_mem), &temp_az);
size_t totalsize = 0;
for (device_info *device : cpu_devices) {
size_t worksize;
clGetKernelWorkGroupInfo(sum, device->id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &worksize, NULL);
device->max_work_group_size = worksize;
totalsize += worksize;
}
for (device_info *device : gpu_devices) {
size_t worksize;
clGetKernelWorkGroupInfo(sum, device->id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &worksize, NULL);
device->max_work_group_size = worksize;
totalsize += worksize;
}
size_t n_chunks = array_size/totalsize;
size_t remainder = array_size%totalsize;
size_t offset = 0;
for (device_info *device : gpu_devices) {
device->m_offset = offset;
device->m_chunk = n_chunks*device->max_work_group_size;
offset += device->m_chunk;
}
for (device_info *device : cpu_devices) {
device->m_offset = offset;
device->m_chunk = n_chunks*device->max_work_group_size;
offset += device->m_chunk;
}
cpu_devices.back()->m_chunk += remainder;
cl_event event;
std::vector<cl_event> buffer_events;
std::vector<cl_event> unmap_events;
// Make sure the a_ memory starts with zero.
const cl_char pattern = 0;
for (device_info *device : gpu_devices) {
offset = device->m_offset*sizeof(cl_double);
size_t fill_size = device->m_chunk*sizeof(cl_double);
clEnqueueFillBuffer(device->queue, a_r, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
clEnqueueFillBuffer(device->queue, a_p, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
clEnqueueFillBuffer(device->queue, a_z, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
}
for (device_info *device : cpu_devices) {
offset = device->m_offset*sizeof(cl_double);
size_t fill_size = device->m_chunk*sizeof(cl_double);
clEnqueueFillBuffer(device->queue, a_r, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
clEnqueueFillBuffer(device->queue, a_p, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
clEnqueueFillBuffer(device->queue, a_z, &pattern, sizeof(cl_char), offset*sizeof(cl_char), fill_size*sizeof(cl_char), 0, NULL, &event);
buffer_events.push_back(event);
}
// For each coil current, load the temp_ buffers from the netCDF file and
// enqueue one accumulation kernel per device. Temp buffer values are
// written by mapping the memory object into the host and filling it
// directly; the unmap events gate the kernel launches.
//
// BUG FIXES vs. the original:
//  * The second and third clEnqueueUnmapMemObject calls passed temp_ar
//    instead of temp_ap / temp_az (copy-paste bug), so temp_ap and
//    temp_az were never unmapped and their host writes were never made
//    visible to the devices — which matches the GPU "losing" values.
//  * buffer_events.push_back(event) ran BEFORE the unmap that produces
//    the event for temp_ap/temp_az, enqueuing a stale event instead of
//    the real unmap event.
for (size_t i = 0, e = extcur.size(); i < e; i++) {
    clSetKernelArg(sum, 6, sizeof(cl_double), extcur.data() + i);
    cl_command_queue cpu_queue = cpu_devices.back()->queue;
    // Map `buffer`, fill it from the netCDF variable "<prefix><iii>",
    // then unmap it and record the unmap event in buffer_events.
    auto load_buffer = [&](const char *prefix, cl_mem buffer) {
        std::stringstream name;
        name << prefix << std::setfill('0') << std::setw(3) << i + 1;
        nc_inq_varid(ncid, name.str().c_str(), &temp_varid);
        cl_double *temp_buffer = static_cast<cl_double *>(clEnqueueMapBuffer(cpu_queue, buffer, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, buffer_size, static_cast<cl_uint>(last_events.size()), last_events.data(), NULL, NULL));
        nc_get_var(ncid, temp_varid, temp_buffer);
        clEnqueueUnmapMemObject(cpu_queue, buffer, temp_buffer, 0, NULL, &event);
        buffer_events.push_back(event);
    };
    load_buffer("ar_", temp_ar);
    load_buffer("ap_", temp_ap);  // original unmapped temp_ar here by mistake
    load_buffer("az_", temp_az);  // original unmapped temp_ar here by mistake
    for (cl_event ev : last_events) {
        clReleaseEvent(ev);
    }
    last_events.clear();
    // Call kernels. Each launch waits on every unmap event so the temp_
    // buffers are fully written before any device reads them.
    auto launch = [&](device_info *device) {
        offset = device->m_offset;
        size_t chunk = device->m_chunk;
        clEnqueueNDRangeKernel(device->queue, sum, 1, &offset, &chunk, NULL, static_cast<cl_uint>(buffer_events.size()), buffer_events.data(), &event);
        last_events.push_back(event);
        // (The original's `offset += fill_size;` after the GPU launch was
        // dead code — offset is reassigned before every use.)
    };
    for (device_info *device : gpu_devices) launch(device);
    for (device_info *device : cpu_devices) launch(device);
    for (cl_event ev : buffer_events) {
        clReleaseEvent(ev);
    }
    buffer_events.clear();
}
我期望第一次内核调用写入的值会一直保留在设备上,直到所有内核调用完成。但是,第一个GPU设备的输出显示,前一次内核调用写入的值没有被保留。我的调试语句的输出如下
1 0 0.00000e+00 0.00000e+00 0.00000e+00 1.17641e-05 2.32558e-05 -3.04041e-05 4.20712e+03
2 0 4.94930e-02 9.78400e-02 -1.27913e-01 1.17641e-05 2.32558e-05 -3.04041e-05 4.20712e+03
# The first three values should be 4.94930e-02 9.78400e-02 -1.27913e-01 not zero.
1 0 0.00000e+00 0.00000e+00 0.00000e+00 -1.33115e-21 1.44679e-07 0.00000e+00 1.11479e+03
2 0 -1.48396e-18 1.61287e-04 0.00000e+00 -1.33115e-21 1.44679e-07 0.00000e+00 1.11479e+03
1 0 0.00000e+00 0.00000e+00 0.00000e+00 2.94903e-24 -3.56183e-06 0.00000e+00 1.46293e+04
2 0 4.31422e-20 -5.21071e-02 0.00000e+00 2.94903e-24 -3.56183e-06 0.00000e+00 1.46293e+04
# This call correctly retained the values of the previous call.
1 0 4.31422e-20 -5.21071e-02 0.00000e+00 6.39571e-22 1.25141e-05 0.00000e+00 -9.38901e-02
2 0 4.30821e-20 -5.21083e-02 0.00000e+00 6.39571e-22 1.25141e-05 0.00000e+00 -9.38901e-02
1 0 4.31422e-20 -5.21071e-02 0.00000e+00 8.62591e-22 -1.69898e-06 0.00000e+00 0.00000e+00
2 0 4.31422e-20 -5.21071e-02 0.00000e+00 8.62591e-22 -1.69898e-06 0.00000e+00 0.00000e+00
1 0 4.31422e-20 -5.21071e-02 0.00000e+00 -1.79097e-05 -3.44107e-22 4.79035e-05 -4.39945e+02
2 0 7.87928e-03 -5.21071e-02 -2.10749e-02 -1.79097e-05 -3.44107e-22 4.79035e-05 -4.39945e+02
1 0 4.31422e-20 -5.21071e-02 0.00000e+00 -3.54518e-24 1.54271e-07 0.00000e+00 7.54929e+03
2 0 1.63786e-20 -5.09424e-02 0.00000e+00 -3.54518e-24 1.54271e-07 0.00000e+00 7.54929e+03
1 0 1.63786e-20 -5.09424e-02 0.00000e+00 1.77797e-08 -3.33409e-08 -6.29109e-09 1.03658e+03
2 0 1.84302e-05 -5.09770e-02 -6.52124e-06 1.77797e-08 -3.33409e-08 -6.29109e-09 1.03658e+03
1 0 1.63786e-20 -5.09424e-02 0.00000e+00 -1.47516e-05 8.13733e-06 2.78991e-06 0.00000e+00
2 0 1.63786e-20 -5.09424e-02 0.00000e+00 -1.47516e-05 8.13733e-06 2.78991e-06 0.00000e+00
1 0 1.63786e-20 -5.09424e-02 0.00000e+00 5.12265e-12 1.55708e-07 -1.05426e-11 0.00000e+00
2 0 1.63786e-20 -5.09424e-02 0.00000e+00 5.12265e-12 1.55708e-07 -1.05426e-11 0.00000e+00
它是否在调用之间进行隐式同步?
更新
以下是有关设备的信息。
Platform Name : Apple
Device Name : Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
Device Type : CPU
Supports Double : True
Address Bits : 64
Max Work Size : 1
Extensions : cl_APPLE_SetMemObjectDestructor
cl_APPLE_ContextLoggingFunctions
cl_APPLE_clut
cl_APPLE_query_kernel_names
cl_APPLE_gl_sharing
cl_khr_gl_event
cl_khr_fp64
cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics
cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics
cl_khr_byte_addressable_store
cl_khr_int64_base_atomics
cl_khr_int64_extended_atomics
cl_khr_3d_image_writes
cl_khr_image2d_from_buffer
cl_APPLE_fp64_basic_ops
cl_APPLE_fixed_alpha_channel_orders
cl_APPLE_biased_fixed_point_image_formats
cl_APPLE_command_queue_priority
m_offset : 731392
m_chunk : 3080
Platform Name : Apple
Device Name : AMD Radeon R9 M370X Compute Engine
Device Type : GPU
Supports Double : True
Address Bits : 32
Max Work Size : 256
Extensions : cl_APPLE_SetMemObjectDestructor
cl_APPLE_ContextLoggingFunctions
cl_APPLE_clut
cl_APPLE_query_kernel_names
cl_APPLE_gl_sharing
cl_khr_gl_event
cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics
cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics
cl_khr_byte_addressable_store
cl_khr_image2d_from_buffer
cl_khr_depth_images
cl_APPLE_command_queue_priority
cl_APPLE_command_queue_select_compute_units
cl_khr_fp64
m_offset : 0
m_chunk : 731392