Question

我正在使用 OpenCL，而我只有一颗 i3 双核 CPU => 我只有一个设备（我的 CPU）。所以基本上，我猜我的 HOST（CPU）同时也是 DEVICE。我尝试启动内核，但分配给 DEVICE（同时也是主机）的任务永远不会结束。仔细想想，让 HOST 等待 DEVICE（也就是它自己）完成似乎是不可能的。有人知道解决这个问题的方法吗？也许可以使用 clCreateSubDevices，把我唯一的设备细分为主机和真正的设备？

Answer 1

下面是我的类 Java 代码，希望有人能指出我的错误。实际上，当我去掉（代码底部的）clFinish(commandQueue) 再运行以下代码时，得到如下输出：

I use the platform Intel(R) OpenCL — Enqueueing kernels... — Pause for 15000 ms. — Task INCOMPLETE

如果我添加clFinish（commandQueue），我有输出，我的任务完成了：

I use the platform Intel(R) OpenCL — Enqueueing kernels... — Event kernel status: CL_COMPLETE event ID: 10 runtime: 2.631 ms. — Pause for 15000 ms. — Task COMPLETE

那么为什么单个clFinish（）指令允许我完成任务？提前谢谢你的解释。

/**
 * Minimal JOCL sample that enqueues a vector-add kernel on a CPU OpenCL device,
 * registers a completion callback that prints profiling information, waits for
 * the queue to finish and finally reports the execution status of the kernel event.
 */
public class Test_CPU
{
    /** OpenCL C source of the kernel: each work-item {@code gid} computes {@code c[gid] = a[gid] + b[gid]}. */
    private static final String programSource0 =
        "__kernel void vectorAdd(" +
        "     __global const float *a,"+
        "     __global const float *b, " +
        "     __global float *c)"+
        "{"+
        "    int gid = get_global_id(0);"+
        "    c[gid] = a[gid]+b[gid];"+
        "}";

    /**
     * The entry point of this sample.
     *
     * @param args Not used
     * @throws IllegalStateException if the configured platform or device index
     *         does not exist on this machine
     */
    public static void main(String args[])
    {
        // Callback invoked when the kernel event reaches the registered status.
        // It prints the elapsed time between the command being queued and the
        // command finishing (so queueing latency is included), in milliseconds.
        // user_data is the integer tag passed to clSetEventCallback and is used
        // to match the profiling output to the associated kernel.
        EventCallbackFunction kernelCommandEvent = new EventCallbackFunction()
        {
            @Override
            public void function(cl_event event, int event_status, Object user_data)
            {
                int evID = (int)user_data;
                long[] queuedTime = new long[1];
                long[] endTime = new long[1];

                clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, Sizeof.cl_long, Pointer.to(queuedTime), null);
                clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, Sizeof.cl_long, Pointer.to(endTime), null);

                // Profiling counters are reported in nanoseconds; 1.0e-6 converts to milliseconds.
                double run_time = (double)(endTime[0] - queuedTime[0]);
                System.out.println("Event kernel status: " + CL.stringFor_command_execution_status(event_status) + " event ID: " + evID + " runtime: " + String.format("%8.3f", (run_time*1.0e-6)) + " ms.");
            }
        };

        // Initialize the input data
        int n = 1000000;
        float srcArrayA[] = new float[n];
        float srcArrayB[] = new float[n];
        for (int i = 0; i < n; i++)
        {
            srcArrayA[i] = i;
            srcArrayB[i] = i;
        }
        Pointer srcA = Pointer.to(srcArrayA);
        Pointer srcB = Pointer.to(srcArrayB);

        // The platform, device type and device number that will be used
        final int platformIndex = 1;
        final long deviceType = CL_DEVICE_TYPE_CPU;
        final int deviceIndex = 0;

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);

        // Obtain the number of platforms
        int numPlatformsArray[] = new int[1];
        clGetPlatformIDs(0, null, numPlatformsArray);
        int numPlatforms = numPlatformsArray[0];
        if (platformIndex >= numPlatforms)
        {
            throw new IllegalStateException("Platform index " + platformIndex
                + " is not available: only " + numPlatforms + " OpenCL platform(s) found");
        }

        // Obtain a platform ID
        cl_platform_id platforms[] = new cl_platform_id[numPlatforms];
        clGetPlatformIDs(platforms.length, platforms, null);
        cl_platform_id platform = platforms[platformIndex];

        // Query the length of the platform name, then the name itself
        long size[] = new long[1];
        clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, null, size);
        byte buffer[] = new byte[(int)size[0]];
        clGetPlatformInfo(platform, CL_PLATFORM_NAME, buffer.length, Pointer.to(buffer), null);
        // Create a string from the buffer (excluding the trailing \0 byte)
        System.out.println("I use the platform " +  new String(buffer, 0, buffer.length-1));

        // Initialize the context properties
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platform);

        // Obtain the number of devices for the platform
        int numDevicesArray[] = new int[1];
        clGetDeviceIDs(platform, deviceType, 0, null, numDevicesArray);
        int numDevices = numDevicesArray[0];
        if (deviceIndex >= numDevices)
        {
            throw new IllegalStateException("Device index " + deviceIndex
                + " is not available: only " + numDevices + " matching device(s) found");
        }

        // Obtain a device ID
        cl_device_id devices[] = new cl_device_id[numDevices];
        clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
        cl_device_id device = devices[deviceIndex];

        // Create a context for the selected device
        cl_context context = clCreateContext(contextProperties, 1, new cl_device_id[]{device}, null, null, null);

        // Create a command-queue with profiling enabled, on the same device the
        // context was created for (not unconditionally devices[0]).
        long properties = CL.CL_QUEUE_PROFILING_ENABLE;
        cl_command_queue commandQueue = CL.clCreateCommandQueue(context, device, properties, null);

        // Allocate the buffer memory objects; the inputs are copied from host memory
        cl_mem srcMemA = CL.clCreateBuffer(context, CL.CL_MEM_READ_ONLY | CL.CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * n, srcA, null);
        cl_mem srcMemB = CL.clCreateBuffer(context, CL.CL_MEM_READ_ONLY | CL.CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * n, srcB, null);
        cl_mem dstMem0 = CL.clCreateBuffer(context, CL.CL_MEM_READ_WRITE, Sizeof.cl_float * n, null, null);

        // Create and build the program, then create the kernel
        cl_program program0 = CL.clCreateProgramWithSource(context, 1, new String[]{ programSource0 }, null, null);
        CL.clBuildProgram(program0, 0, null, null, null, null);
        cl_kernel kernel0 = CL.clCreateKernel(program0, "vectorAdd", null);

        // Set the arguments
        CL.clSetKernelArg(kernel0, 0, Sizeof.cl_mem, Pointer.to(srcMemA));
        CL.clSetKernelArg(kernel0, 1, Sizeof.cl_mem, Pointer.to(srcMemB));
        CL.clSetKernelArg(kernel0, 2, Sizeof.cl_mem, Pointer.to(dstMem0));

        // One work-item per vector element
        long globalWorkSize[] = new long[]{n};

        System.out.println("Enqueueing kernels...");
        cl_event kernelEvent = new cl_event();
        clEnqueueNDRangeKernel(commandQueue, kernel0, 1, null, globalWorkSize, null, 0, null, kernelEvent);

        // Tag handed to the callback as user_data
        final int eventTag = 10;
        clSetEventCallback(kernelEvent, CL_COMPLETE, kernelCommandEvent, eventTag);

        // Enqueueing a command does not by itself guarantee that it is submitted
        // to the device; clFinish submits everything in the queue and blocks
        // until it has completed.
        clFinish(commandQueue);

        System.out.println("Pause for 15000 ms.");
        try
        {
            Thread.sleep(15000);
        }
        catch (InterruptedException iEx)
        {
            // Restore the interrupt flag instead of swallowing the interruption
            Thread.currentThread().interrupt();
        }

        // See if task completed
        int[] ok = new int[1];
        clGetEventInfo(kernelEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, Sizeof.cl_int, Pointer.to(ok), null);
        if (ok[0] == CL_COMPLETE)
        {
            System.out.println("Task COMPLETE");
        }
        else
        {
            System.out.println("Task INCOMPLETE");
        }

        // Release every OpenCL object created above
        CL.clReleaseEvent(kernelEvent);
        CL.clReleaseMemObject(srcMemA);
        CL.clReleaseMemObject(srcMemB);
        CL.clReleaseMemObject(dstMem0);
        CL.clReleaseKernel(kernel0);
        CL.clReleaseProgram(program0);
        CL.clReleaseCommandQueue(commandQueue);
        CL.clReleaseContext(context);
    }
}

Answer 2

我认为我的想法并不是那么糟糕，因为实际上，你需要以编程方式强制主机切换到DEVICE工作，在这种情况下，HOST和DEVICE都是相同的硬件。

实际上，可以把 HOST 同时用作 DEVICE，但要让 DEVICE 真正开始工作，你至少需要调用一个阻塞函数（clFinish() 或 clEnqueueReadBuffer(..., CL_TRUE, ...)）。否则，HOST 会一直处于工作状态，永远不会切换去执行 DEVICE 的任务。我试过添加一个 sleep() 调用，但没有效果，你确实需要添加一个阻塞的 OpenCL 函数。

无论如何，谢谢！

将HOST用作设备

2 个答案: