clEnqueueNDRangeKernel 何时或为什么会让 event 参数保持为 NULL?

时间:2016-12-18 17:55:57

标签: c concurrency opencl

您好,我正在尝试运行《Seven Concurrency Models in Seven Weeks》一书中的示例代码。作者使用 MacBook,而我使用的是运行 Windows 10 的 Dell XPS。

我的程序崩溃是因为在调用函数 clEnqueueNDRangeKernel() 之后,timing_event 仍然为空。

cl_event timing_event;
size_t work_units = NUM_ELEMENTS;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units,
    NULL, 0, NULL,&timing_event);

关于事件参数

文档中写道:
  

事件

     

返回标识此特定内核执行实例的事件对象。事件对象是唯一的,可用于稍后识别特定的内核执行实例。如果 event 为 NULL,则不会为此内核执行实例创建任何事件,因此应用程序无法查询该内核执行实例,也无法排队等待它。

有人可以解释为什么这会发生在我的戴尔电脑上,而不是作者的 MacBook 上吗?

1 个答案:

答案 0 :(得分:0)

我找到了解决方案。这个问题并非来自 clEnqueueNDRangeKernel(),而是更早在 clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 中就已经发生了。我用 clGetProgramBuildInfo() 检索了构建信息。问题是我的 multiply_arrays.cl 文件不是 UTF-8 编码。

给所有不熟悉 OpenCL 的人:每个 OpenCL 函数都会提供一个状态整数,该整数映射到特定的错误代码。如果函数不直接返回状态代码,则可以向该函数传递一个指针来接收状态代码。请参阅下面链接的示例函数。这对调试程序非常有用。

Returns Status Code

Status Code by Reference

**main.cpp**

/***
* Excerpted from "Seven Concurrency Models in Seven Weeks",
* published by The Pragmatic Bookshelf.
* Copyrights apply to this code. It may not be used to create training material,
* courses, books, articles, and the like. Contact us if you are in doubt.
* We make no guarantees that this code is fit for any purpose.
* Visit http://www.pragmaticprogrammer.com/titles/pb7con for more book information.
***/
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <mach/mach_time.h>
#else  
#include <CL/cl.h>
#include <Windows.h>

#endif

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

#include <chrono>
#include <iostream>
#include <vector>

#define NUM_ELEMENTS (100000)

/**
 * Reads a whole file into a newly allocated, NUL-terminated buffer.
 *
 * The file is opened in binary mode so that the size reported by ftell()
 * matches the number of bytes fread() delivers; in text mode on Windows the
 * CRLF translation makes fread() return fewer bytes than ftell() reported,
 * which would leave uninitialised bytes in front of the terminator.
 *
 * @param filename Path of the file to read.
 * @return Buffer allocated with malloc() that the caller must free(), or a
 *         null pointer if the file cannot be opened, measured or allocated.
 */
char* read_source(const char* filename) {
    FILE* file = fopen(filename, "rb");
    if (file == nullptr)
        return nullptr;

    char* program = nullptr;
    if (fseek(file, 0, SEEK_END) == 0) {
        const long length = ftell(file);
        if (length >= 0) {
            rewind(file);
            program = static_cast<char*>(malloc(static_cast<size_t>(length) + 1));
            if (program != nullptr) {
                // Terminate after the bytes actually read, not after the expected size.
                const size_t bytes_read =
                    fread(program, sizeof(char), static_cast<size_t>(length), file);
                program[bytes_read] = '\0';
            }
        }
    }

    fclose(file);
    return program;
}

/**
 * Fills the first `size` elements of `array` with pseudo-random values
 * obtained from rand() and scaled by RAND_MAX, i.e. values in [0, 1].
 *
 * @param array Destination buffer; must hold at least `size` elements.
 * @param size  Number of elements to write.
 */
void random_fill(cl_float array[], size_t size) {
    // Index with size_t so the comparison against `size` is not signed/unsigned.
    for (size_t i = 0; i < size; ++i)
        array[i] = static_cast<cl_float>(rand()) / RAND_MAX;
}

int main() {
    //Status for Errorhandling
    cl_int status;

    //Identify Platform
    cl_platform_id platform;
    clGetPlatformIDs(1, &platform, NULL);

    //Get Id of GPU
    cl_device_id device;
    cl_uint num_devices = 0;
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);

    // Create Context
    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);


    //Use context to create Command Queue
    //Que enables us to send commands to the gpu device
    cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);

    //Load Kernel
    char* source = read_source("multiply_arrays.cl");
    cl_program program = clCreateProgramWithSource(context, 1,
        (const char**)&source, NULL, &status);
    free(source);

    // Build Program
    status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    size_t len;
    char *buffer;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
    buffer = (char *) malloc(len);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
    printf("%s\n", buffer);

    //Create Kernel
    cl_kernel kernel = clCreateKernel(program, "multiply_arrays", &status);

    // Create Arrays with random Numbers
    cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS];
    random_fill(a, NUM_ELEMENTS);
    random_fill(b, NUM_ELEMENTS);

    //uint64_t startGPU = mach_absolute_time();
    auto start = std::chrono::high_resolution_clock::now();


    //Create Readonly input Buffers with value from a and b
    cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(cl_float) * NUM_ELEMENTS, a, NULL);
    cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(cl_float) * NUM_ELEMENTS, b, NULL);

    //Create Output buffer write Only
    cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
        sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL);

    //set Kernel Arguments
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);

    cl_event timing_event;
    size_t work_units = NUM_ELEMENTS;
    status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units,
        NULL, 0, NULL,&timing_event);

    cl_float results[NUM_ELEMENTS];
    //Calculate Results and copy from output buffer to results
    clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS,
        results, 0, NULL, NULL);

    //uint64_t endGPU = mach_absolute_time();
    auto finish = std::chrono::high_resolution_clock::now();
    //printf("Total (GPU): %lu ns\n\n", (unsigned long)(endGPU - startGPU));
    std::cout << "Total(GPU) :"<< std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << "ns\n";

    cl_ulong starttime;
    clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START,
        sizeof(cl_ulong), &starttime, NULL);
    cl_ulong endtime;
    clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END,
        sizeof(cl_ulong), &endtime, NULL);
    printf("Elapsed (GPU): %lu ns\n\n", (unsigned long)(endtime - starttime));
    clReleaseEvent(timing_event);
    clReleaseMemObject(inputA);
    clReleaseMemObject(inputB);
    clReleaseMemObject(output);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    //uint64_t startCPU = mach_absolute_time();
    start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < NUM_ELEMENTS; ++i)
        results[i] = a[i] * b[i];

    //uint64_t endCPU = mach_absolute_time();
    finish = std::chrono::high_resolution_clock::now();
    //printf("Elapsed (CPU): %lu ns\n\n", (unsigned long)(endCPU - startCPU));
    std::cout << "Elapsed (CPU) :" << std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << "ns\n";
    return 0;
}

**multiply_arrays.cl**

/*
 * Element-wise product of two float arrays.
 *
 * Each work-item handles the single element selected by its global id in
 * dimension 0: output[id] = inputA[id] * inputB[id].
 */
__kernel void multiply_arrays(__global const float* inputA,
                              __global const float* inputB,
                              __global float* output)
{
    const int gid = get_global_id(0);
    const float lhs = inputA[gid];
    const float rhs = inputB[gid];
    output[gid] = lhs * rhs;
}
//ö