Question

我为一个 OpenCL 程序编写了下面这个 Mandelbrot 内核。为了测试，我把整个复平面上的采样点都放进了数组里。我的问题是：打印输出时，得到的是一串 1（也就是 results 数组的初始值），而不是内核计算出的结果。

问题出在哪里？我该如何解决？

    #include <iostream>
    #include <vector>

    #ifdef __APPLE__
    #include <OpenCL/opencl.h>
    #else
    #include <CL/cl.h>
    #endif

    int main(){
    using namespace std;
    int xPixel=100;
    int yPixel=100;
    float ics[xPixel];
    for(int i=0;i<xPixel;++i)
    ics[i]=-2+i*((float)4/xPixel);
  float ypsilon[yPixel];
  for(int i=0;i<yPixel;++i)
    ypsilon[i]=-2+i*((float)4/yPixel);
  int results[xPixel*yPixel];
  for(int i=0;i<xPixel*yPixel;++i)
    results[i]=1;

  cl_context context;
  cl_context_properties properties[3];
  cl_kernel kernel;
  cl_command_queue command_queue;
  cl_program program;
  cl_int err;
  cl_uint num_of_platforms=0;
  cl_platform_id platform_id;
  cl_device_id device_id;
  cl_uint num_of_devices=0;
  cl_mem memX, memY, memOutput;
  size_t global;

const char *KernelSource =
"__kernel void mandelbrot(__global float *ics, __global float *ypsilon, __global int *output){\n"\
"size_t id=get_global_id(0);\n"\
"int yPixel=100;\n"\
"for(int i=0;i<yPixel;i++){\n"\
"float x=0;\n"\
"float y=0;\n"\
"int counter=0;\n"\
"while(counter<1000){\n"\
"if(x*x+y*y>2*2){\n"\
"output[(id*yPixel)+i]=counter;\n"\
"break;\n"\
"}\n"\
"float xTemp=x*x-y*y+ics[id];\n"\
"y=2*x*y+ypsilon[i];\n"\
"x=xTemp;\n"\
"counter++;\n"\
"}\n"\
"}\n"\
"}\n";

  // retreives a list of platforms available
  if (clGetPlatformIDs(1, &platform_id, &num_of_platforms)!= CL_SUCCESS){
    cout<<"Unable to get platform_id\n"<<endl;;
    return 1;
  }

  // try to get a supported GPU device
  if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,&num_of_devices) != CL_SUCCESS){
    cout<<"Unable to get device_id\n"<<endl;
    return 1;
  }

  //context properties list - nust be terminated with 0
  properties[0]=CL_CONTEXT_PLATFORM;
  properties[1]=(cl_context_properties)platform_id;
  properties[2]=0;

  //create a context with the GPU device
  context=clCreateContext(properties,1,&device_id,NULL,NULL,&err);

  //create a command queue using the context and device
  command_queue=clCreateCommandQueue(context,device_id,0,&err);

  //create a program from the kernel source code
  program=clCreateProgramWithSource(context,1,(const char**)&KernelSource,NULL,&err);

  //compile the program
  if(clBuildProgram(program,0,NULL,NULL,NULL,NULL)!=CL_SUCCESS){
    cout<<"Error building program"<<endl;
    return 1;
  }

  //specify which kernel from the program to execute
  kernel=clCreateKernel(program,"mandelbrot",&err);

  //create buffers for input and output
  memX=clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*xPixel,NULL,NULL);
  memY=clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(float)*yPixel,NULL,NULL);
  memOutput=clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(int)*(xPixel*yPixel),NULL,NULL);

  //load data into the input buffer
  clEnqueueWriteBuffer(command_queue,memX,CL_TRUE,0,sizeof(float)*xPixel,ics,0,NULL,NULL);
  clEnqueueWriteBuffer(command_queue,memY,CL_TRUE,0,sizeof(float)*yPixel,ypsilon,0,NULL,NULL);

  //set the argument list for the kernel command
  clSetKernelArg(kernel,0,sizeof(cl_mem),&memX);
  clSetKernelArg(kernel,1,sizeof(cl_mem),&memY);
  clSetKernelArg(kernel,2,sizeof(cl_mem),&memOutput);
  global=xPixel*yPixel;

  //enqueue the kernel command for execution
  clEnqueueNDRangeKernel(command_queue,kernel,1,NULL,&global,NULL,0,NULL,NULL);
  clFinish(command_queue);

  //copy the results from out of the output buffer
  clEnqueueReadBuffer(command_queue,memOutput,CL_TRUE,0,sizeof(int)*(xPixel*yPixel),results,0,NULL,NULL);

  //print output
   for(int i=0;i<xPixel;++i){
     for(int j=0;j<yPixel;++j){
       cout<<results[(i*yPixel)+j]<<" ";
     }
     cout<<endl;
   }

  //cleanup - release OpenCL resources
  clReleaseMemObject(memX);
  clReleaseMemObject(memY);
  clReleaseMemObject(memOutput);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(command_queue);
  clReleaseContext(context);
}

Answer 1

我没有看出确切的原因，但我有一个疑问：既然这个内核已经针对每个元素各运行一次，那么遍历 yPixel 的 “i” 循环是做什么用的？看起来你做了 X * Y * Y 的工作量，而不是 X * Y（你的全局大小是 X * Y，而内核内部又在 Y 上循环了一遍）。

如果在 “i” 循环之前添加 `output[(id*yPixel)+i] = 42`，那么输出缓冲区里会是什么内容？这可以告诉你问题是出在内核中还是出在主机代码中。

为了帮助其他人看这个，我重新格式化了内核代码：

/* Mandelbrot escape-count kernel, quoted from the question and laid out one
 * statement per line. Work-item `id` reads ics[id] as the real part and, for
 * every i in [0, yPixel), ypsilon[i] as the imaginary part.
 * Note: output is written only when the point escapes (x*x + y*y > 4) before
 * the counter reaches 1000; otherwise that output element is left untouched. */
__kernel void mandelbrot(__global float *ics,
                         __global float *ypsilon,
                         __global int *output)
{
    size_t id = get_global_id(0);
    int yPixel = 100;

    for (int i = 0; i < yPixel; i++) {
        float x = 0;
        float y = 0;

        /* Iterate z = z*z + c, stopping at the first iteration where |z| > 2. */
        for (int counter = 0; counter < 1000; counter++) {
            if (x*x + y*y > 2*2) {
                output[(id*yPixel) + i] = counter;
                break;
            }
            float xTemp = x*x - y*y + ics[id];
            y = 2*x*y + ypsilon[i];
            x = xTemp;
        }
    }
}

Mandelbrot在OpenCL中

1 个答案: