OpenCL 2D阵列实现

时间:2016-11-21 04:37:32

标签: c++ visual-studio-2015 parallel-processing opencl

我目前正在用 OpenCL 实现二维数组(矩阵)运算。当 matrix_size 不超过 15 左右时,程序基本都能正常运行;但当我把它增大到 100 左右时,程序就会崩溃。根据 Visual Studio 调试器,问题似乎是整数除以 0,但我不确定它发生在哪里。我的猜测是工作项和工作组的设置有问题:

queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(matrix_size*matrix_size), cl::NullRange);

不幸的是,我不太确定如何解决这个问题。最终,我希望能够在相对较大的数据集上运行这些基本计算。

int main() {
`   srand((unsigned int)time(NULL));
    const int matrix_size = 100;
    string input;
    string func;
    string input_file;
    cout << "Please enter a arithmetic option: multi or add" << endl;
    cout << ">> ";
    input_file = "MatrixArithmetic.cl";
    getline(cin, input);
    if (input[0] == 'a') {func = "matrix_add";}
    else if (input[0] == 'm') {func = "matrix_multi";}
    else { cout << "Not a valid option... exiting" << endl; return 0; }
    ifstream ArithmeticFile(input_file);
    string src(istreambuf_iterator<char>(ArithmeticFile), (istreambuf_iterator<char>()));


    //prepare platform
    vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    auto platform = platforms.front();
    //gather device info from platform and store into devices vector
    vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
    //chose device for computation
    auto device = devices.front();
    cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>() << endl;
    /*cout << "This: " << CL_DEVICE_MAX_MEM_ALLOC_SIZE << endl;
    cout << "This 2: " << CL_DEVICE_MAX_MEM_ALLOC_SIZE << endl;*/
    //setup the context

    cl::Program::Sources sources;
    sources.push_back({ src.c_str(), src.length() });
    cl::Context context(device);
    cl::Program program(context, sources);
    auto err = program.build("-cl-std=CL1.2");
    //setup kernel (this is kernel specific)
    cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(float)*matrix_size*matrix_size);
    cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(float)*matrix_size*matrix_size);
    cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(float)*matrix_size*matrix_size);
    //build and seed matrix using random for computation this is done on the main processor
    float vec1[matrix_size][matrix_size];
    float vec2[matrix_size][matrix_size];
    for (int x = 0; x < matrix_size; x++) {
        for (int y = 0; y < matrix_size; y++) {
            vec1[x][y] = (float)(1+rand()%(rand()%1000));
            vec2[x][y] = (float)(1+rand()%(rand()%1000));
        }
    }
    //queue setup for pushing commands to device
    cl::CommandQueue queue(context, device);
    //write vec1 and vec2 to device
    queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float)*matrix_size*matrix_size, vec1);
    queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float)*matrix_size*matrix_size, vec2);
    ////run the kernel
    cl::Kernel kernel = cl::Kernel(program, func.c_str());
    //pushing argument to kernel it has 3 total arguments
    kernel.setArg(0, buffer_A);
    kernel.setArg(1, buffer_B);
    kernel.setArg(2, buffer_C);
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(matrix_size*matrix_size), cl::NullRange);
    queue.finish();
    //writing to the buffer 
    float vec3[matrix_size][matrix_size];
    queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(float)*matrix_size*matrix_size, vec3);
    cin.get();
    return 0;
}


Kernel:

    // Element-wise sum of two flattened matrices: C[i] = A[i] + B[i].
    //   A, B: read-only input buffers
    //   C:    output buffer
    // There is no bounds check, so the global work size must not exceed the
    // number of elements in the buffers.
    __kernel void matrix_add(__global const float *A, __global const float *B, __global float *C)
    {
        // Each work-item handles the element at its global id in dimension 0.
        // get_global_id returns size_t, so keep that type instead of narrowing to int.
        const size_t idx = get_global_id(0);
        C[idx] = A[idx] + B[idx];
    }

    // Multiplies two flattened matrices element by element: C[i] = A[i] * B[i].
    // Note: this is a per-element product, not a row-by-column matrix product.
    //   A, B: read-only input buffers
    //   C:    output buffer
    __kernel void matrix_multi(__global const float *A,
                               __global const float *B,
                               __global float *C)
    {
        // The matrices arrive as flat 1-D buffers; each work-item processes
        // the element at its global id in dimension 0.
        const int gid = get_global_id(0);
        C[gid] = A[gid] * B[gid];
    }

规格:i5 4690K 和 AMD R9 290,8GB 内存,所以内存本身应该不是问题。大小为 100x100 时,每个“矩阵”应该占用大约 4000 字节。

2 个答案:

答案 0 :(得分:0)

罪魁祸首:rand()%1000 有时求值为 0,于是外层的取模运算就变成了对 0 取模(即整数除以零)。原来的代码是:

vec1[x][y] = (float)(1+rand()%(rand()%1000));
vec2[x][y] = (float)(1+rand()%(rand()%1000));

更改为:

vec1[x][y] = (float)(1+rand()%(1+rand()%1000));
vec2[x][y] = (float)(1+rand()%(1+rand()%1000));

答案 1 :(得分:0)

您应该使用包装器将1-D数组(或分配)视为2-D数组:

MDArr buf = new MDArr(100,100); // allocates 10k memory and gets its pointer
buf[10][90]=3; // this is overloaded operator usage for host-side
clEnqueueWriteBuffer(.,..,buf.ptr()) // 0th address of contiguous mem
                                    // same as &vec1[0][0]

这样数据就位于一块连续的内存中,不会访问到不连续的非法内存区域。

编辑:在 x86 CPU 上,100x100 的 float 数组需要 4 * 100 * 100 = 40,000 字节(约 40 KB)的内存。