How to run two GPUs in parallel with PyOpenCL

Posted: 2016-11-18 10:28:38

Tags: python opencl pyopencl

Below is my working Python code. I use PyOpenCL to run on the GPU. I have a late-2013 Mac Pro with two AMD FirePro GPUs, and I would like to run on both GPUs in parallel. Can someone tell me how to do that? I tried setting the queue properties to properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE, but that raises an invalid command error.
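From what I have read, the usual pattern is a single context over both devices, one in-order command queue per device, and enqueueing work on both queues before waiting on either of them. Here is a minimal toy sketch of that idea (the kernel, buffer names and sizes are made up just for illustration, and it assumes the first platform exposes at least two GPUs). Is this the right approach, and how do I apply it to the function below?

import numpy as np
import pyopencl as cl

platform = cl.get_platforms()[0]
gpus = platform.get_devices(device_type=cl.device_type.GPU)  # assumes at least 2 GPUs here

ctx = cl.Context(devices=gpus)
queue_a = cl.CommandQueue(ctx, device=gpus[0])  # one in-order queue per device
queue_b = cl.CommandQueue(ctx, device=gpus[1])

prog = cl.Program(ctx, """
    __kernel void scale(__global float *x) { int i = get_global_id(0); x[i] *= 2.0f; }
""").build()

host_a = np.arange(1024, dtype=np.float32)
host_b = np.arange(1024, dtype=np.float32)
mf = cl.mem_flags
buf_a = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_a)
buf_b = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=host_b)

# Enqueue on both queues before waiting on either; neither call blocks,
# so the two GPUs should be able to work at the same time.
ev_a = prog.scale(queue_a, host_a.shape, None, buf_a)
ev_b = prog.scale(queue_b, host_b.shape, None, buf_b)

# Only now wait for both kernels, then read the results back (blocking copies).
cl.wait_for_events([ev_a, ev_b])
cl.enqueue_copy(queue_a, host_a, buf_a)
cl.enqueue_copy(queue_b, host_b, buf_b)

The actual function I want to split across the two GPUs is below: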

import numpy
import pyopencl as cl


def Unification_GPU_2(FBNs_C1, FBNs_C2, NOT_C1, NOT_C2, size_FBNs_C1, size_FBNs_C2):  # unify FBNs of the same class when they are identical
    platform = cl.get_platforms()  # all OpenCL platforms on this machine
    device = platform[0].get_devices(device_type=cl.device_type.GPU)  # all GPUs on the first platform
    context = cl.Context(devices=device)  # one context spanning every device in the "device" list; context.num_devices gives the number of devices in this context

    len_del_array_c1 = numpy.int32(size_FBNs_C1)
    len_del_array_c2 = numpy.int32(size_FBNs_C2)
    # print("everything good so far")


    ####   KERNEL CODE in C. [IP + 1FBN] = 1 thread
    # print('ready to run Kernel. Compiling .....')
    program = cl.Program(context, """
        __kernel void Kernel_FBNs_Retained(__global int *FBNs_array, __global int *NOT_array,__global int *size_FBNs,__global int *c1_c2,__global int *del_array)

        {


            int i1 = get_global_id(0); // thread_global_id=i1

            //FBN....... copy FBN[i1] from the global FBNs array into a private copy for this thread, since it's read many times
            int FBN[20][20];
            for (int j1=0; j1<20;j1++) //Number of parameters
            {
            for (int j2=0; j2<20;j2++) // Number of bits per parameter
                {
                FBN[j1][j2]=FBNs_array[(i1*400)+(j1*20)+j2];
                }
            }


            //NOT.... copy the NOT values for this thread's FBN, since they are used often
            int NOT[2];
            for (int j1 =0; j1<2; j1++)
                {
                    NOT[j1]=NOT_array[(i1*2)+j1];
                }

            int del_array_value=1;


            for  (int i2=0; i2<size_FBNs[0];i2++)  // FBN[thread] will check with all other FBNs to see if any FBN_i1=FBN_i2
            {
                //Checking if FBNi1=FBNi2
                int FBNs_are_same=1; // 0 = false , not same. 1= true, they are same
                for (int j1=0; j1<20;j1++) //Number of parameters
                {
                    for (int j2=0; j2<20;j2++) // Number of bits per parameter
                        {
                            if (FBN[j1][j2]!=FBNs_array[(i2*400)+(j1*20)+j2])
                                {
                                    FBNs_are_same--;
                                    break;
                                }
                        }
                    if (FBNs_are_same==0)
                        { break;}
                }

                if (FBNs_are_same==1)
                {
                    if ( (NOT[c1_c2[0]] < NOT_array[ (i2*2)+(c1_c2[0])] ) || (  ( NOT[c1_c2[0]] == NOT_array[(i2*2)+(c1_c2[0])] ) && (i1<i2) ) )
                    {
                        del_array_value--;
                        break;
                    }
                }

            }

            del_array[i1]=del_array_value;

        }
                """).build()
    queue_1 = cl.CommandQueue(context, device=device[0])  # ,properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
    queue_2 = cl.CommandQueue(context, device=device[1])  # one command queue per GPU device
    mem_flags = cl.mem_flags

    # print("Unification_GPU")

    # C1

    # READ VARIABLES
    FBNs_array_buf_C1 = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=FBNs_C1)
    NOT_array_buf_C1 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=NOT_C1)
    c1 = numpy.array([0], numpy.int32)
    c1_c2_buf_C1 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=c1)
    size_FBNs = numpy.array([len_del_array_c1], numpy.int32)
    size_FBNs_buf_c1 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=size_FBNs)

    # WRITE VARIABLES
    del_array_C1 = numpy.ones((len_del_array_c1), numpy.int32)
    del_array_buf_C1 = cl.Buffer(context, mem_flags.WRITE_ONLY, del_array_C1.nbytes)


    # C2

    # READ VARIABLES
    FBNs_array_buf_C2 = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=FBNs_C2)
    NOT_array_buf_C2 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=NOT_C2)
    c2 = numpy.array([1], numpy.int32)
    c1_c2_buf_C2 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=c2)
    size_FBNs = numpy.array([len_del_array_c2], numpy.int32)
    size_FBNs_buf_C2 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=size_FBNs)

    # WRITE VARIABLES
    del_array_C2 = numpy.ones(len_del_array_c2, numpy.int32)
    del_array_buf_C2 = cl.Buffer(context, mem_flags.WRITE_ONLY, del_array_C2.nbytes)

    print("Unification_GPU Kernel 1 running")
    program.Kernel_FBNs_Retained(queue_1, (len_del_array_c1,), None, FBNs_array_buf_C1, NOT_array_buf_C1, size_FBNs_buf_c1, c1_c2_buf_C1, del_array_buf_C1)  # enqueues the kernel on queue_1; the call returns without waiting for completion
    print("Unification_GPU Kernel 1 done")

    print("Unification_GPU Kernel 2 running")
    program.Kernel_FBNs_Retained(queue_2, (len_del_array_c2,), None, FBNs_array_buf_C2, NOT_array_buf_C2, size_FBNs_buf_C2, c1_c2_buf_C2, del_array_buf_C2)  # enqueues the kernel on queue_2; the call returns without waiting for completion
    print("Unification_GPU Kernel 2 done")
    # print("loading outputs from GPU to CPU")




    cl.enqueue_copy(queue_1, del_array_C1, del_array_buf_C1)  # blocking read: waits for queue_1 to finish
    cl.enqueue_copy(queue_2, del_array_C2, del_array_buf_C2)  # blocking read: waits for queue_2 to finish

    # print('Done')

    return (del_array_C1, del_array_C2)
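
For completeness, this is roughly how I call the function (the data below is random filler, just to show the layout the kernel expects: 400 int32 values per FBN and 2 NOT counters per FBN):

if __name__ == "__main__":
    n_c1, n_c2 = 64, 48  # filler sizes for illustration only
    FBNs_C1 = numpy.random.randint(0, 2, size=n_c1 * 400).astype(numpy.int32)
    FBNs_C2 = numpy.random.randint(0, 2, size=n_c2 * 400).astype(numpy.int32)
    NOT_C1 = numpy.random.randint(0, 100, size=n_c1 * 2).astype(numpy.int32)
    NOT_C2 = numpy.random.randint(0, 100, size=n_c2 * 2).astype(numpy.int32)

    keep_c1, keep_c2 = Unification_GPU_2(FBNs_C1, FBNs_C2, NOT_C1, NOT_C2, n_c1, n_c2)
    # keep_cX[i] == 1 means FBN i is retained; 0 means it is a duplicate that gets dropped.
    print(keep_c1.sum(), "of", n_c1, "class-1 FBNs retained")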

0 Answers:

No answers