我最近开始使用 OpenCL(Intel SDK,i7-7700)。我尝试实现一个简单的 k-means 算法,程序的计算结果完全正常,但主机程序在释放输出缓冲区时(clReleaseMemObject(output_z_buf);)总是崩溃,而释放其他缓冲区则没有任何问题。我盯着代码看了一整天,仍然找不出自己哪里做错了。有人有什么想法吗?
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#ifdef __APPLE__
#include "OpenCL/opencl.h"
#else
#include <CL\cl.h>
#endif
using namespace std;
// Called throughout main() with the status code returned by OpenCL calls.
// The definition is not part of this listing; presumably it reports and
// aborts on any value other than CL_SUCCESS -- TODO confirm.
static void CheckForError(int success);
// Called once in main() with the device the user selected, to print
// information about it. The definition is not part of this listing.
static void display_device_info(cl_device_id device);
// Path of the OpenCL C source file that main() reads and compiles at run time.
const char* programPath = "kmeans.cl";
int main()
{
unsigned platformIDCount = 0;
unsigned selectedPlatform = 0;
unsigned deviceIDCount = 0;
unsigned selectedDevice = 0;
size_t stringSize = 0;
string stringResult = "";
cl_event kernel_event;
cl_event finish_event;
int success = CL_SUCCESS;
// Get amount of available platforms
clGetPlatformIDs(0, NULL, &platformIDCount);
if (platformIDCount == 0) {
cerr << "No OpenCL platform found" << endl;
return 1;
}
else {
cout << "Found " << platformIDCount << " platforms:" << endl;
}
// Get IDs of available platforms
vector<cl_platform_id> platformIDs(platformIDCount);
clGetPlatformIDs(platformIDCount, platformIDs.data(), NULL);
// Get names of available platforms
for (unsigned platformID = 0; platformID < platformIDCount; platformID++) {
clGetPlatformInfo(platformIDs[platformID], CL_PLATFORM_NAME, 0, NULL, &stringSize);
stringResult.resize(stringSize);
clGetPlatformInfo(platformIDs[platformID], CL_PLATFORM_NAME, stringSize, const_cast<char*> (stringResult.data()), NULL);
cout << (platformID) << ": " << stringResult << endl;
}
// Select platform
cout << "Please select a platform: " << endl;
cin >> selectedPlatform;
// Get amount of available devices
deviceIDCount = 0;
clGetDeviceIDs(platformIDs[selectedPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceIDCount);
if (deviceIDCount == 0) {
cerr << "No OpenCL devices found" << endl;
return 1;
}
else {
cout << "Found " << deviceIDCount << " devices:" << endl;
}
// Get IDs of available devices
vector<cl_device_id> deviceIDs(deviceIDCount);
clGetDeviceIDs(platformIDs[selectedPlatform], CL_DEVICE_TYPE_ALL, deviceIDCount, deviceIDs.data(), NULL);
// Get names of available devices
for (unsigned deviceID = 0; deviceID < deviceIDCount; deviceID++) {
clGetDeviceInfo(deviceIDs[deviceID], CL_DEVICE_NAME, 0, NULL, &stringSize);
stringResult.resize(stringSize);
clGetDeviceInfo(deviceIDs[deviceID], CL_DEVICE_NAME, stringSize, const_cast<char*> (stringResult.data()), NULL);
cout << (deviceID) << ": " << stringResult << endl;
}
// Select device
cout << "Please select a device: " << endl;
cin >> selectedDevice;
// Print selected device info
display_device_info(deviceIDs[selectedDevice]);
// Create device context
const cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties> (platformIDs[selectedPlatform]), 0, 0 };
cl_context context = clCreateContext(contextProperties, deviceIDCount, deviceIDs.data(), NULL, NULL, &success);
CheckForError(success);
cout << "Context created" << endl;
// Create program
ifstream in(programPath);
string programSource((istreambuf_iterator<char>(in)), istreambuf_iterator<char>());
size_t lengths[1] = { programSource.size() };
const char* sources[1] = { programSource.data() };
cl_program program = clCreateProgramWithSource(context, 1, sources, lengths, &success);
CheckForError(success);
cout << "Program created" << endl;
// Build program
success = clBuildProgram(program, deviceIDCount, deviceIDs.data(), NULL, NULL, NULL);
if (success == CL_BUILD_PROGRAM_FAILURE) {
// Print Build Info Log
size_t log_size;
clGetProgramBuildInfo(program, deviceIDs[selectedDevice], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *)malloc(log_size);
clGetProgramBuildInfo(program, deviceIDs[selectedDevice], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
printf("%s\n", log);
}
CheckForError(success);
cout << "Program built" << endl;
// Create kernel
cl_kernel kernel = clCreateKernel(program, "kmeans", &success);
CheckForError(success);
cout << "Kernel created" << endl;
// Generate test data
// A 1024 height * 1024 width area
// 256 random mean values for 256 k
static const size_t testDataSize = 1024;
static const unsigned kCount = 16;
cl_short3 *input_x = (cl_short3 *)malloc(testDataSize * sizeof(cl_short3));
cl_short3 *input_c = (cl_short3 *)malloc(kCount * sizeof(cl_short3));
cl_short3 *output_z = (cl_short3 *)malloc(kCount * sizeof(cl_short3));
for (int j = 0; j < testDataSize; ++j) {
input_x[j].s0 = (rand() % (1 << 13)) - (1 << 13) / 2;
input_x[j].s1 = (rand() % (1 << 13)) - (1 << 13) / 2;
input_x[j].s2 = (rand() % (1 << 13)) - (1 << 13) / 2;
}
for (int j = 0; j < kCount; ++j) {
int idx = rand() % testDataSize;
input_c[j] = input_x[idx];
}
cout << "Test data generated" << endl;
cl_mem input_x_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, testDataSize * sizeof(cl_short3), (void *) input_x, NULL);
cl_mem input_c_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, kCount * sizeof(cl_short3), (void *) input_c, NULL);
cl_mem output_z_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, kCount * sizeof(cl_short3), (void *) output_z, NULL);
cout << "Buffers created" << endl;
// Create command queue
cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceIDs[selectedDevice], 0, &success);
CheckForError(success);
cout << "Command queue created" << endl;
// Set kernel arguments
int argc = 0;
clSetKernelArg(kernel, argc++, sizeof(cl_mem), &input_x_buf);
clSetKernelArg(kernel, argc++, sizeof(cl_mem), &input_c_buf);
clSetKernelArg(kernel, argc++, sizeof(cl_mem), &output_z_buf);
// Set local memory arguments (if Scratchpad is available, otherwise fail)
clSetKernelArg(kernel, argc++, 2 * sizeof(cl_long), NULL);
clSetKernelArg(kernel, argc++, kCount * sizeof(cl_short3), NULL);
clSetKernelArg(kernel, argc++, kCount * sizeof(cl_int), NULL);
clSetKernelArg(kernel, argc++, kCount * sizeof(cl_long), NULL);
clSetKernelArg(kernel, argc++, kCount * sizeof(cl_int3), NULL);
clSetKernelArg(kernel, argc++, sizeof(unsigned), &kCount);
cout << "Kernel arguments set" << endl;
// Enqueue command to execute kernel on device
const size_t globalWorkSize[] = { testDataSize, 0, 0 };
success = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, globalWorkSize, 0, NULL, &kernel_event);
//success = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
CheckForError(success);
//cout << "Command enqueued" << endl;
// Read memory buffer for result data
success = clEnqueueReadBuffer(queue, output_z_buf, CL_TRUE, 0, kCount * sizeof(cl_short3), output_z, 0, NULL, &finish_event);
CheckForError(success);
cout << "Memory buffer for result data read" << endl;
success = clWaitForEvents(1, &finish_event);
success += clWaitForEvents(1, &kernel_event);
success += clFinish(queue);
CheckForError(success);
cout << "Command queue finished" << endl;
cout << "New centers:" << endl;
for (unsigned i = 0; i < kCount; i++) {
cout << i << ": " << output_z[i].s0 << ", " << output_z[i].s1 << ", " << output_z[i].s2 << endl;
}
// Clean resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseMemObject(input_x_buf);
clReleaseMemObject(input_c_buf);
clReleaseMemObject(output_z_buf);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;
}
更新:我并没有完全解决这个问题,但为了方便以后遇到类似问题的读者,这里记录一下我的处理办法。首先,我按照用户 parallel highway 的建议修改了主机代码:
// Read memory buffer for result data.
// The read is blocking (CL_TRUE) and lists kernel_event in its wait list, so
// it cannot start before the kernel has finished and output_z is complete
// once the call returns.
success = clEnqueueReadBuffer(queue, output_z_buf, CL_TRUE, 0, kCount * sizeof(cl_short3), output_z, 1, &kernel_event, &finish_event);
CheckForError(success);
cout << "Memory buffer for result data read" << endl;
// Each status code is checked on its own instead of being summed up.
success = clFlush(queue);
CheckForError(success);
success = clWaitForEvents(1, &finish_event);
CheckForError(success);
success = clFinish(queue);
CheckForError(success);
cout << "Command queue finished" << endl;
cout << "New centers:" << endl;
for (unsigned i = 0; i < kCount; i++) {
    cout << i << ": " << output_z[i].s0 << ", " << output_z[i].s1 << ", " << output_z[i].s2 << endl;
}
// Clean resources.
// Events returned by the enqueue calls are reference-counted objects too;
// release them before the objects they refer to.
clReleaseEvent(kernel_event);
clReleaseEvent(finish_event);
clReleaseMemObject(input_x_buf);
clReleaseMemObject(input_c_buf);
clReleaseMemObject(output_z_buf);
clReleaseCommandQueue(queue);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
按理说,这样修改应该能解决 output_z_buf 无法释放的问题,但不知为何并没有奏效。后来我发现内核代码对这个问题有影响。很自然地,我是在内核代码的最后才写入输出缓冲区的;但如果这样做,内核执行完毕后缓冲区似乎没有被完全“关闭”,从而导致释放缓冲区时崩溃。如果在写入输出缓冲区之后再执行一些其他操作(必须是实际的运算,仅仅加一个 barrier 不起作用),问题就不会出现。此时内核代码如下所示:
// The algorithm doing its stuff
// Store this work-item's result components at index gid0 of the output buffer.
// NOTE(review): the host allocates the output buffer for kCount (16) elements
// but launches testDataSize (1024) work-items. If gid0 is the global id and
// this store is not guarded by a gid0 < kCount check, it writes past the end
// of the buffer, which could explain the crash on release -- confirm against
// the full kernel.
output_z_buf[gid0].s0 = x;
output_z_buf[gid0].s1 = y;
output_z_buf[gid0].s2 = z;
// Placeholder for the extra work that, according to the author, makes the
// crash disappear when it follows the stores above.
for (int i = 0; i < 10; i++) {
// do some other stuff
}
// Work-group barrier with a local-memory fence; the author reports that the
// barrier on its own does not avoid the crash.
barrier(CLK_LOCAL_MEM_FENCE);
// End of Kernel code