所需大小远小于全局大小,因此看起来我的设备有足够的内存。为什么我会收到这个错误?
程序输出如下:Required size: 39819160,Global size: 1709598311,Local size: 65536
内核有一个printf语句,但它没有打印任何内容,所以我猜内核是可以的。它至少适用于小数据。
相同的代码在另一台配备英特尔 HD 430、全局内存大小上限更高的计算机上运行良好。
我使用相同的上下文和device_id一遍又一遍地运行此代码。 它在第一次迭代时失败。
// Terminates the process when an OpenCL call did not succeed.
// When err differs from CL_SUCCESS the numeric status code is written to
// stderr and the process exits with status -1; otherwise this is a no-op.
void check_errors(cl_int err) {
    if (err == CL_SUCCESS)
        return;

    cerr << err << endl;
    exit(-1);
}
/**
 * Runs the OpenCL "fit" kernel over T randomly drawn sample indices and then
 * derives the members v and w from the per-sample counters it produces.
 *
 * Steps: draw T indices in [0, data_size) with rand(), build the kernel from
 * the embedded source, upload a/x/y/indices to device buffers, launch one
 * work-item per index (rounded up to a multiple of 16), wait for completion,
 * read the counters back, and finally compute
 *   v[j] = a[j] * y[j] / (h * T)
 * and accumulate w from the rows of x via produce_vector / add_to_vector.
 *
 * NOTE: the kernel body inside the embedded source is currently commented
 * out, so the counters stay at zero until it is re-enabled.
 * NOTE(review): v and w are overwritten with fresh allocations on every call;
 * whether the previous buffers are released elsewhere is not visible here.
 *
 * @param x           host array read as data_size rows of weight_size floats
 * @param weight_size number of floats per row of x
 * @param y           host array of data_size floats
 * @param data_size   number of rows; 0 makes the call a no-op
 * @param h           scaling parameter used as 1 / (h * T)
 * @param T           number of random indices / work-items; must be > 0
 * @return 0 on success or when data_size == 0, -1 when T == 0.
 *         Any OpenCL or host allocation failure terminates the process.
 */
int SVM::fit(cl_float* x, cl_uint weight_size, cl_float* y, cl_uint data_size, cl_float h, cl_uint T) {
    cl_int err;
    if (data_size == 0)
        return 0;
    // With no iterations the index buffer would be empty, the global work
    // size would be zero and 1/(h*T) would divide by zero.
    if (T == 0)
        return -1;

    set(x, weight_size, data_size);

    // Heap allocations instead of variable-length arrays: VLAs are not
    // standard C++ and overflow the stack for large T / weight_size.
    cl_uint* a = static_cast<cl_uint*>(calloc(data_size, sizeof(cl_uint)));
    cl_uint* indices = static_cast<cl_uint*>(malloc(sizeof(cl_uint) * T));
    if (a == NULL || indices == NULL)
        check_errors(CL_OUT_OF_HOST_MEMORY);

    for (size_t t = 0; t < T; t++)
        indices[t] = rand() % data_size;

    // OpenCL C source. Both kernel_func and the body of fit are wrapped in
    // /* ... */ inside the string, i.e. the kernel is currently a no-op.
    const char* src = "/*float kernel_func(__global float* x, __global float* b, size_t size) {\n"
    " float s = 0;\n"
    " while( size > 0 ) {\n"
    " size--;\n"
    " s += x[size] * b[size];\n"
    " }\n"
    "\n"
    " return s;\n"
    "}\n"
    "\n"
    "\n*/"
    "__kernel void fit(__global uint* indices, uint T, __global uint* a, __global float* x, uint weight_size, __global float* y, uint data_size, float h) {\n"
    " /*uint id = get_global_id(0);\n"
    " if (id >= T)\n"
    " return;\n"
    "\n"
    " uint t = id + 1;\n"
    " uint i = indices[id];\n"
    "\n"
    " float q = 1/(h*t);\n"
    " float s = 0;\n"
    "\n"
    " for(int j = 0; j < data_size; j++)\n"
    " if( j != i ) {\n"
    " s += a[j] * y[j] * kernel_func(&x[i * weight_size], &x[j * weight_size], weight_size);\n"
    " }\n"
    "\n"
    " if (y[i] * q * s < 1) {\n"
    " atomic_add(&a[i], 1);\n"
    " }\n"
    "*/}\n";

    cl_command_queue cmd_queue = clCreateCommandQueue(context, device_id, 0, &err);
    check_errors(err);

    cl_program program = clCreateProgramWithSource(context, 1, &src, NULL, &err);
    check_errors(err);

    if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) {
        // Ask the runtime how long the build log is instead of reserving a
        // ~1 MB array on the stack.
        size_t log_size = 0;
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        char* log = static_cast<char*>(calloc(log_size + 1, 1));
        if (log != NULL) {
            clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
            cerr << log << endl;
            free(log);
        }
        std::exit(-1);
    }

    cl_kernel opencl_kernel = clCreateKernel(program, "fit", &err);
    check_errors(err);

    // Device buffers; total_size only feeds the diagnostic output below.
    size_t total_size = 0;

    total_size += sizeof(cl_uint) * data_size;
    cl_mem a_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * data_size, a, &err);
    check_errors(err);

    total_size += sizeof(cl_float) * weight_size * data_size;
    cl_mem x_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * weight_size * data_size, x, &err);
    check_errors(err);

    total_size += sizeof(cl_float) * data_size;
    cl_mem y_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * data_size, y, &err);
    check_errors(err);

    total_size += sizeof(cl_uint) * T;
    cl_mem indices_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * T, indices, &err);
    check_errors(err);  // this check was missing for the indices buffer
    // CL_MEM_COPY_HOST_PTR copies the data when the buffer is created, so
    // the host-side index array is no longer needed.
    free(indices);

    cout << "Required size: " << total_size << endl;

    cl_ulong size;
    err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
    check_errors(err);
    cout << "Global size: " << size << endl;

    err = clGetDeviceInfo(device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
    check_errors(err);
    cout << "Local size: " << size << endl;

    // Kernel arguments, in the order of the kernel signature.
    err = clSetKernelArg(opencl_kernel, 0, sizeof(cl_mem), &indices_buffer);
    check_errors(err);
    err = clSetKernelArg(opencl_kernel, 1, sizeof(cl_uint), &T);
    check_errors(err);
    err = clSetKernelArg(opencl_kernel, 2, sizeof(cl_mem), &a_buffer);
    check_errors(err);
    err = clSetKernelArg(opencl_kernel, 3, sizeof(cl_mem), &x_buffer);
    check_errors(err);
    err = clSetKernelArg(opencl_kernel, 4, sizeof(cl_uint), &weight_size);
    check_errors(err);
    err = clSetKernelArg(opencl_kernel, 5, sizeof(cl_mem), &y_buffer);
    check_errors(err);
    err = clSetKernelArg(opencl_kernel, 6, sizeof(cl_uint), &data_size);
    check_errors(err);
    err = clSetKernelArg(opencl_kernel, 7, sizeof(cl_float), &h);
    check_errors(err);

    // Round the global size up to a whole number of work-groups of 16.
    size_t localWorkSize = 16;
    size_t numWorkGroups = (T + localWorkSize - 1) / localWorkSize;
    size_t globalWorkSize = numWorkGroups * localWorkSize;

    err = clEnqueueNDRangeKernel(cmd_queue, opencl_kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    check_errors(err);

    err = clFinish(cmd_queue);
    if (err != CL_SUCCESS) {
        cerr << endl;
        cerr << err << endl;
        cout << "Data size: " << data_size << endl;
        cout << "Weight size: " << weight_size << endl;
        /*for(int i = 0; i < data_size; i++) {
        for(int yi = 0; yi < 28; yi++) {
        for(int xi = 0; xi < 28; xi++) {
        if (x[i * weight_size + yi*28 + xi] == 0)
        cout << " ";
        else
        cout << "*";
        }
        cout << endl;
        }
        cout << (int)y[i] << endl;
        }*/
        exit(-1);
    }

    // a_buffer is a device-side copy of a; without this blocking read the
    // host array would never see what the kernel wrote.
    err = clEnqueueReadBuffer(cmd_queue, a_buffer, CL_TRUE, 0, sizeof(cl_uint) * data_size, a, 0, NULL, NULL);
    check_errors(err);

    clReleaseMemObject(indices_buffer);
    clReleaseMemObject(a_buffer);
    clReleaseMemObject(x_buffer);
    clReleaseMemObject(y_buffer);
    clReleaseKernel(opencl_kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(cmd_queue);

    float q = 1/(h*T);

    v = (float*)malloc(sizeof(float) * data_size);
    w = (float*)calloc(weight_size, sizeof(float));
    float* cur_x = static_cast<float*>(malloc(sizeof(float) * weight_size));
    if (v == NULL || w == NULL || cur_x == NULL)
        check_errors(CL_OUT_OF_HOST_MEMORY);

    for (size_t j = 0; j < data_size; j++) {
        v[j] = q * a[j] * y[j];
    }

    // Each row of x is copied into scratch space before being handed to
    // produce_vector, then folded into w.
    // NOTE(review): presumably produce_vector scales cur_x by v[i] in place
    // and add_to_vector adds it to w element-wise; confirm against helpers.
    for (size_t i = 0; i < data_size; i++) {
        memcpy(cur_x, &x[i * weight_size], weight_size * sizeof(float));
        produce_vector(cur_x, weight_size, v[i]);
        add_to_vector(w, cur_x, weight_size);
    }

    free(cur_x);
    free(a);
    return 0;
}