Question

我是 OpenCL 的新手，如果有人能提供帮助，我将不胜感激。我有一个小内核，运行时总是以 CL_INVALID_COMMAND_QUEUE 错误结束。我在不同的硬件（GTX 765M 和 GTX 980）上都试过，结果相同。以下是主机端代码和内核代码：

//get all platforms (drivers)
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if(all_platforms.size()==0){
    std::cout<<" No platforms found. Check OpenCL installation!\n";
    exit(1);
}
cl::Platform default_platform=all_platforms[1];
std::cout << "Using platform: "<<default_platform.getInfo<CL_PLATFORM_NAME>()<<"\n";

// Ask the chosen platform for devices of every type and use the first one;
// abort when the platform reports no devices.
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.empty()) {
    std::cout << " No devices found. Check OpenCL installation!\n";
    exit(1);
}
cl::Device default_device = all_devices.front();
std::cout << "Using device: " << default_device.getInfo<CL_DEVICE_NAME>() << "\n";

// Create the OpenCL context from the selected device.
cl::Context context({default_device});

// Collection of kernel source strings that is later passed to cl::Program.
cl::Program::Sources sources;

// OpenCL C source of the "test" kernel, assembled from adjacent string
// literals into a single line of source text.
//  - Work items with global id below 3000 write 0 to R[i] and return.
//  - Every other work item sums A[i-3000] .. A[i] (inclusive) into R[i].
std::string kernel_code =
    "__kernel void test(__global float* A,__global float* R) {"
    "int i = get_global_id(0);"
    "if(i<3000) {R[i]=0;return;} "
    "float vm=0;"
    "for(int j=i-3000;j<=i;++j)vm+=A[j];"
    "R[i]=vm;"
    "}";

sources.push_back({kernel_code.c_str(),kernel_code.length()});

cl::Program program(context,sources);
if(program.build({default_device})!=CL_SUCCESS){
    std::cout<<" Error building: "<<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)<<"\n";
    exit(1);
}

size_t n=1075021;
// create buffers on the device
cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(float)*n);
cl::Buffer buffer_R(context,CL_MEM_READ_WRITE,sizeof(float)*n);

float *A = new float[n];
float *R = new float[n];

srand (time(NULL));

for(size_t i=0;i<n;++i)
    A[i]=rand()%10;

//create queue to which we will push commands for the device.
cl::CommandQueue queue(context,default_device);

//write arrays A to the device
queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,sizeof(float)*n,A);
queue.finish();

//run the kernel
cl_int ret;
cl::Kernel kernel_test=cl::Kernel(program,"test");
kernel_test.setArg(0,buffer_A);
kernel_test.setArg(1,buffer_R);
queue.enqueueNDRangeKernel(kernel_test,cl::NullRange,cl::NDRange(n),cl::NullRange);
ret=queue.finish();

//read result R from the device to array R
ret=queue.enqueueReadBuffer(buffer_R,CL_TRUE,0,sizeof(float)*n,R);

Answer 1

总结到目前为止我们在评论中得到的结论：

全局工作项（global work-items）的数量被设为 1075021，这是一个素数。最初的代码让 OpenCL 实现自行决定如何划分工作；由于本地大小必须能整除全局大小，而素数只能被 1 和它本身整除，工作很可能被划分成 1075021 个工作组、每个工作组只有 1 个工作项。这除了非常低效之外，还很可能触及 NVIDIA 硬件的限制。

将全局大小向上取整到 1024 的整数倍，并显式指定本地大小为 1024 之后，偏移量为 3000 时的问题得到了解决，但把偏移量设为 65000 时问题仍然存在。请注意，向上取整后需要在内核中添加额外的边界检查，以免处理超出 1075021 个元素的范围。

当偏移量为 65000 时，前 65000 个工作项的内核只是执行 R[i] = 0，这是另一处严重的低效。这部分清零可以在主机端完成，而内核可以安排为从第 65000 个元素开始处理。具体做法如下：

首先是内核：

// OpenCL C source of the offset-aware "test" kernel, assembled from adjacent
// string literals into a single line of source text.
//  - Work items whose global id is 1075021 or more return without writing
//    (guards the padding added when the global size is rounded up).
//  - Every other work item sums A[i-65000] .. A[i] (inclusive) into R[i].
// NOTE: j starts at i-65000, so the indices are only non-negative when the
// kernel is launched with a global offset of at least 65000.
std::string kernel_code =
    "__kernel void test(__global float* A,__global float* R) {"
    "int i = get_global_id(0);"
    "if(i >= 1075021)    return;"
    "float vm=0;"
    "for(int j=i-65000;j<=i;++j)vm+=A[j];"
    "R[i]=vm;"
    "}";

现在使用偏移量：

// Total element count, and the number of leading elements that are handled
// on the host instead of by the kernel.
size_t n = 1075021;
size_t offset = 65000;

// Device-side buffers for the input A and the result R.
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, n * sizeof(float));
cl::Buffer buffer_R(context, CL_MEM_READ_WRITE, n * sizeof(float));

// Host-side arrays of the same length.
float *A = new float[n];
float *R = new float[n];

// Zero the first `offset` elements of the host result array.
memset(R, 0, offset * sizeof(float));

并考虑到偏移量来安排内核：

int local_size=1024;
int global_size = ((n-offset)/local_size+1)*local_size; // calculate required global size taking into account the offset
queue.enqueueNDRangeKernel(kernel_test,cl::NDRange(offset),cl::NDRange(global_size),cl::NDRange(local_size)); // schedule kernel using our offset
ret=queue.finish();

//read result R from the device to array R
ret=queue.enqueueReadBuffer(buffer_R,CL_TRUE,0,sizeof(float)*(n-offset),&R[offset]); // read modified data only

另请注意，如果内核执行时间过长（我记得通常是超过 5 秒），硬件看门狗定时器可能会终止内核。如果遇到这种情况，可以考虑把超时时间增加到 60 秒，或者将其禁用。

CL_INVALID_COMMAND_QUEUE问题

1 个答案: