我有一个大问题(在Linux上): 我创建了一个包含已定义数据的缓冲区,然后OpenCL内核获取此数据并将其放入image2d_t。在使用AMD C50(Fusion CPU / GPU)时,程序可以正常工作,但在我的GeForce 9500 GT上,给定的内核很少计算出正确的结果。有时结果是正确的,但通常是不正确的。有时它取决于非常奇怪的变化,如删除未使用的变量声明或添加换行符。我意识到禁用优化会增加失败的可能性。我在两个系统中都拥有最实际的显示驱动程序。
这是我的简化代码:
#include <CL/cl.h>
#include <string>
#include <iostream>
#include <sstream>
#include <cmath>
void checkOpenCLErr(cl_int err, std::string name){
const char* errorString[] = {
"CL_SUCCESS",
"CL_DEVICE_NOT_FOUND",
"CL_DEVICE_NOT_AVAILABLE",
"CL_COMPILER_NOT_AVAILABLE",
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
"CL_OUT_OF_RESOURCES",
"CL_OUT_OF_HOST_MEMORY",
"CL_PROFILING_INFO_NOT_AVAILABLE",
"CL_MEM_COPY_OVERLAP",
"CL_IMAGE_FORMAT_MISMATCH",
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
"CL_BUILD_PROGRAM_FAILURE",
"CL_MAP_FAILURE",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"CL_INVALID_VALUE",
"CL_INVALID_DEVICE_TYPE",
"CL_INVALID_PLATFORM",
"CL_INVALID_DEVICE",
"CL_INVALID_CONTEXT",
"CL_INVALID_QUEUE_PROPERTIES",
"CL_INVALID_COMMAND_QUEUE",
"CL_INVALID_HOST_PTR",
"CL_INVALID_MEM_OBJECT",
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
"CL_INVALID_IMAGE_SIZE",
"CL_INVALID_SAMPLER",
"CL_INVALID_BINARY",
"CL_INVALID_BUILD_OPTIONS",
"CL_INVALID_PROGRAM",
"CL_INVALID_PROGRAM_EXECUTABLE",
"CL_INVALID_KERNEL_NAME",
"CL_INVALID_KERNEL_DEFINITION",
"CL_INVALID_KERNEL",
"CL_INVALID_ARG_INDEX",
"CL_INVALID_ARG_VALUE",
"CL_INVALID_ARG_SIZE",
"CL_INVALID_KERNEL_ARGS",
"CL_INVALID_WORK_DIMENSION",
"CL_INVALID_WORK_GROUP_SIZE",
"CL_INVALID_WORK_ITEM_SIZE",
"CL_INVALID_GLOBAL_OFFSET",
"CL_INVALID_EVENT_WAIT_LIST",
"CL_INVALID_EVENT",
"CL_INVALID_OPERATION",
"CL_INVALID_GL_OBJECT",
"CL_INVALID_BUFFER_SIZE",
"CL_INVALID_MIP_LEVEL",
"CL_INVALID_GLOBAL_WORK_SIZE",
};
if (err != CL_SUCCESS) {
std::stringstream str;
str << errorString[-err] << " (" << err << ")";
throw std::string(name)+(str.str());
}
}
int main(){
try{
cl_context m_context;
cl_platform_id* m_platforms;
unsigned int m_numPlatforms;
cl_command_queue m_queue;
cl_device_id m_device;
cl_int error = 0; // Used to handle error codes
clGetPlatformIDs(0,NULL,&m_numPlatforms);
m_platforms = new cl_platform_id[m_numPlatforms];
error = clGetPlatformIDs(m_numPlatforms,m_platforms,&m_numPlatforms);
checkOpenCLErr(error, "getPlatformIDs");
// Device
error = clGetDeviceIDs(m_platforms[0], CL_DEVICE_TYPE_GPU, 1, &m_device, NULL);
checkOpenCLErr(error, "getDeviceIDs");
// Context
cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(m_platforms[0]), 0};
m_context = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
// m_private->m_context = clCreateContext(properties, 1, &m_private->m_device, NULL, NULL, &error);
checkOpenCLErr(error, "Create context");
// Command-queue
m_queue = clCreateCommandQueue(m_context, m_device, 0, &error);
checkOpenCLErr(error, "Create command queue");
//Build program and kernel
const char* source = "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n"
"\n"
"__kernel void bufToImage(__global unsigned char* in, __write_only image2d_t out, const unsigned int offset_x, const unsigned int image_width , const unsigned int maxval ){\n"
"\tint i = get_global_id(0);\n"
"\tint j = get_global_id(1);\n"
"\tint width = get_global_size(0);\n"
"\tint height = get_global_size(1);\n"
"\n"
"\tint pos = j*image_width*3+(offset_x+i)*3;\n"
"\tif( maxval < 256 ){\n"
"\t\tfloat4 c = (float4)(in[pos],in[pos+1],in[pos+2],1.0f);\n"
"\t\tc.x /= maxval;\n"
"\t\tc.y /= maxval;\n"
"\t\tc.z /= maxval;\n"
"\t\twrite_imagef(out, (int2)(i,j), c);\n"
"\t}else{\n"
"\t\tfloat4 c = (float4)(255.0f*in[2*pos]+in[2*pos+1],255.0f*in[2*pos+2]+in[2*pos+3],255.0f*in[2*pos+4]+in[2*pos+5],1.0f);\n"
"\t\tc.x /= maxval;\n"
"\t\tc.y /= maxval;\n"
"\t\tc.z /= maxval;\n"
"\t\twrite_imagef(out, (int2)(i,j), c);\n"
"\t}\n"
"}\n"
"\n"
"__constant sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
"\n"
"__kernel void imageToBuf(__read_only image2d_t in, __global unsigned char* out, const unsigned int offset_x, const unsigned int image_width ){\n"
"\tint i = get_global_id(0);\n"
"\tint j = get_global_id(1);\n"
"\tint pos = j*image_width*3+(offset_x+i)*3;\n"
"\tfloat4 c = read_imagef(in, imageSampler, (int2)(i,j));\n"
"\tif( c.x <= 1.0f && c.y <= 1.0f && c.z <= 1.0f ){\n"
"\t\tout[pos] = c.x*255.0f;\n"
"\t\tout[pos+1] = c.y*255.0f;\n"
"\t\tout[pos+2] = c.z*255.0f;\n"
"\t}else{\n"
"\t\tout[pos] = 200.0f;\n"
"\t\tout[pos+1] = 0.0f;\n"
"\t\tout[pos+2] = 255.0f;\n"
"\t}\n"
"}\n";
cl_int err;
cl_program prog = clCreateProgramWithSource(m_context,1,&source,NULL,&err);
if( -err != CL_SUCCESS ) throw std::string("clCreateProgramWithSources");
err = clBuildProgram(prog,0,NULL,"-cl-opt-disable",NULL,NULL);
if( -err != CL_SUCCESS ) throw std::string("clBuildProgram(fromSources)");
cl_kernel kernel = clCreateKernel(prog,"bufToImage",&err);
checkOpenCLErr(err,"CreateKernel");
cl_uint imageWidth = 80;
cl_uint imageHeight = 90;
//Initialize datas
cl_uint maxVal = 255;
cl_uint offsetX = 0;
int size = imageWidth*imageHeight*3;
int resSize = imageWidth*imageHeight*4;
cl_uchar* data = new cl_uchar[size];
cl_float* expectedData = new cl_float[resSize];
for( int i = 0,j=0; i < size; i++,j++ ){
data[i] = (cl_uchar)i;
expectedData[j] = (cl_float)((unsigned char)i)/255.0f;
if ( i%3 == 2 ){
j++;
expectedData[j] = 1.0f;
}
}
cl_mem inBuffer = clCreateBuffer(m_context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,size*sizeof(cl_uchar),data,&err);
checkOpenCLErr(err, "clCreateBuffer()");
clFinish(m_queue);
cl_image_format imgFormat;
imgFormat.image_channel_order = CL_RGBA;
imgFormat.image_channel_data_type = CL_FLOAT;
cl_mem outImg = clCreateImage2D( m_context, CL_MEM_READ_WRITE, &imgFormat, imageWidth, imageHeight, 0, NULL, &err );
checkOpenCLErr(err,"get2DImage()");
clFinish(m_queue);
size_t kernelRegion[]={imageWidth,imageHeight};
size_t kernelWorkgroup[]={1,1};
//Fill kernel with data
clSetKernelArg(kernel,0,sizeof(cl_mem),&inBuffer);
clSetKernelArg(kernel,1,sizeof(cl_mem),&outImg);
clSetKernelArg(kernel,2,sizeof(cl_uint),&offsetX);
clSetKernelArg(kernel,3,sizeof(cl_uint),&imageWidth);
clSetKernelArg(kernel,4,sizeof(cl_uint),&maxVal);
//Run kernel
err = clEnqueueNDRangeKernel(m_queue,kernel,2,NULL,kernelRegion,kernelWorkgroup,0,NULL,NULL);
checkOpenCLErr(err,"RunKernel");
clFinish(m_queue);
//Check resulting data for validty
cl_float* computedData = new cl_float[resSize];;
size_t region[]={imageWidth,imageHeight,1};
const size_t offset[] = {0,0,0};
err = clEnqueueReadImage(m_queue,outImg,CL_TRUE,offset,region,0,0,computedData,0,NULL,NULL);
checkOpenCLErr(err, "readDataFromImage()");
clFinish(m_queue);
for( int i = 0; i < resSize; i++ ){
if( fabs(expectedData[i]-computedData[i])>0.1 ){
std::cout << "Expected: \n";
for( int j = 0; j < resSize; j++ ){
std::cout << expectedData[j] << " ";
}
std::cout << "\nComputed: \n";
std::cout << "\n";
for( int j = 0; j < resSize; j++ ){
std::cout << computedData[j] << " ";
}
std::cout << "\n";
throw std::string("Error, computed and expected data are not the same!\n");
}
}
}catch(std::string& e){
std::cout << "\nCaught an exception: " << e << "\n";
return 1;
}
std::cout << "Works fine\n";
return 0;
}
我还为您上传了源代码,以便更轻松地对其进行测试: http://www.file-upload.net/download-3524302/strangeOpenCLError.cpp.html
如果我做错了,你能告诉我吗? 代码中是否有任何错误,或者这是我的驱动程序中的错误?
最好的reagards, 亚历
编辑:稍微更改了程序(包括:此处和链接的程序),以使其更容易出现不匹配。
答案 0 :(得分:1)
我发现了这个错误,这很烦人:
在linux下工作并且只是将OpenCL程序与最实际的“OpenCV”库(是的,计算库)相连接时,内核的二进制部分会被编译并缓存在〜/ .nv中。< / p>
您能否安装实际的OpenCV库并执行以下命令:
生成坏内核有时可能导致不良行为:
rm -R ~/.nv && g++ strangeOpenCLError.cpp -lOpenCL -lopencv_gpu -o strangeOpenCLError && ./strangeOpenCLError && ls -la ~/.nv/ComputeCache/*/*
生成符合要求的良好内核:
rm -R ~/.nv && g++ strangeOpenCLError.cpp -lOpenCL -o strangeOpenCLError && ./strangeOpenCLError && ls -la ~/.nv/ComputeCache/*/*
在我的系统中使用-lopencv_gpu或-lopencv_core时,由于二进制部分略有不同,我在〜/ .nv中获得了一个稍微具有其他大小的内核对象。因此,这些较小的内核在我的系统中计算出不良结果。
问题是错误并不总是出现:有时只是在处理缓冲区时,这些缓冲区足够大。因此,更可靠的测量是不同的内核缓存大小。我在我的问题中编辑了程序,现在它更有可能产生不好的结果。
祝你好运, 亚历
PS:我还在NVidia上创建了一个错误报告,它正在进行中。他们可以重现他们系统上的错误。
答案 1 :(得分:1)
要关闭Nvidia编译器缓存,请设置env。变量CUDA_CACHE_DISABLE = 1。这有助于避免将来出现这个问题。
答案 2 :(得分:0)
排队
m_context = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
你应该使用&amp; error作为最后一个参数来获得有意义的错误。没有它我得到一些愚蠢的错误信息。 (我需要更改平台才能获得GPU板。)
我无法使用我的nVidia GeForce 8600 GTS重现错误。我得到'工作正常'。我试了20次,没有任何问题。
除了您的代码有点令人困惑之外,我也看不到任何错误。您应该删除所有注释掉的代码,并引入一些空白行来对代码进行分组。
你有最新的司机吗?你描述的行为听起来非常熟悉,就像一个未初始化的缓冲区或变量,但我没有看到类似的东西。