Below is my working Python code. I use PyOpenCL to run on the GPU. I have a late-2013 Mac Pro with two AMD FirePro GPUs, and I would like to run on both GPUs in parallel. Can someone tell me how to do this? I tried setting the queue property to properties = cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE, but it gives an invalid command error.
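For reference, this is a minimal sketch of what I attempted for the queue properties (same platform/device lookup as in the full function below); the properties argument on queue_1 is the part that raises the invalid command error:

import pyopencl as cl

platform = cl.get_platforms()
device = platform[0].get_devices(device_type=cl.device_type.GPU)
context = cl.Context(devices=device)

# Passing the out-of-order property here is what fails on my machine:
queue_1 = cl.CommandQueue(
    context,
    device=device[0],
    properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE,
)
queue_2 = cl.CommandQueue(context, device=device[1])

The full function is below: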
import numpy
import pyopencl as cl


def Unification_GPU_2(FBNs_C1, FBNs_C2, NOT_C1, NOT_C2, size_FBNs_C1, size_FBNs_C2):  # Unifying FBNs of the same class if they are identical
    platform = cl.get_platforms()  # gets all platforms that exist on this machine
    device = platform[0].get_devices(device_type=cl.device_type.GPU)  # gets all GPUs that exist on the first platform from the platform list
    context = cl.Context(devices=device)  # Creates a context for all devices in the "device" list above. context.num_devices gives the number of devices in this context
    len_del_array_c1 = numpy.int32(size_FBNs_C1)
    len_del_array_c2 = numpy.int32(size_FBNs_C2)
    # print("everything good so far")
    #### KERNEL CODE in C. [IP + 1FBN] = 1 thread
    # print('ready to run Kernel. Compiling .....')
    program = cl.Program(context, """
        __kernel void Kernel_FBNs_Retained(__global int *FBNs_array, __global int *NOT_array, __global int *size_FBNs, __global int *c1_c2, __global int *del_array)
        {
            int i1 = get_global_id(0); // thread_global_id = i1
            // FBN....... Copy FBN[i1] from the full FBN array into a private FBN for this thread, since it is used many times
            int FBN[20][20];
            for (int j1 = 0; j1 < 20; j1++) // Number of parameters
            {
                for (int j2 = 0; j2 < 20; j2++) // Number of bits per parameter
                {
                    FBN[j1][j2] = FBNs_array[(i1 * 400) + (j1 * 20) + j2];
                }
            }
            // NOT.... Copy the NOT values for FBN[i1] because we use them often
            int NOT[2];
            for (int j1 = 0; j1 < 2; j1++)
            {
                NOT[j1] = NOT_array[(i1 * 2) + j1];
            }
            int del_array_value = 1;
            for (int i2 = 0; i2 < size_FBNs[0]; i2++) // FBN[i1] is compared with all other FBNs to see if any FBN_i2 equals FBN_i1
            {
                // Checking if FBN_i1 == FBN_i2
                int FBNs_are_same = 1; // 0 = false, not the same; 1 = true, they are the same
                for (int j1 = 0; j1 < 20; j1++) // Number of parameters
                {
                    for (int j2 = 0; j2 < 20; j2++) // Number of bits per parameter
                    {
                        if (FBN[j1][j2] != FBNs_array[(i2 * 400) + (j1 * 20) + j2])
                        {
                            FBNs_are_same--;
                            break;
                        }
                    }
                    if (FBNs_are_same == 0)
                    {
                        break;
                    }
                }
                if (FBNs_are_same == 1)
                {
                    if ((NOT[c1_c2[0]] < NOT_array[(i2 * 2) + (c1_c2[0])]) || ((NOT[c1_c2[0]] == NOT_array[(i2 * 2) + (c1_c2[0])]) && (i1 < i2)))
                    {
                        del_array_value--;
                        break;
                    }
                }
            }
            del_array[i1] = del_array_value;
        }
        """).build()
    queue_1 = cl.CommandQueue(context, device=device[0])  # ,properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
    queue_2 = cl.CommandQueue(context, device=device[1])
    mem_flags = cl.mem_flags
    # print("Unification_GPU")
    # C1
    # READ VARIABLES
    FBNs_array_buf_C1 = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=FBNs_C1)
    NOT_array_buf_C1 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=NOT_C1)
    c1 = numpy.array([0], numpy.int32)
    c1_c2_buf_C1 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=c1)
    size_FBNs = numpy.array(len_del_array_c1, numpy.int32)
    size_FBNs_buf_c1 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=size_FBNs)
    # WRITE VARIABLES
    del_array_C1 = numpy.ones(len_del_array_c1, numpy.int32)
    del_array_buf_C1 = cl.Buffer(context, mem_flags.WRITE_ONLY, del_array_C1.nbytes)
    # C2
    # READ VARIABLES
    FBNs_array_buf_C2 = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=FBNs_C2)
    NOT_array_buf_C2 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=NOT_C2)
    c2 = numpy.array([1], numpy.int32)
    c1_c2_buf_C2 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=c2)
    size_FBNs = numpy.array(len_del_array_c2, numpy.int32)
    size_FBNs_buf_C2 = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=size_FBNs)
    # WRITE VARIABLES
    del_array_C2 = numpy.ones(len_del_array_c2, numpy.int32)
    del_array_buf_C2 = cl.Buffer(context, mem_flags.WRITE_ONLY, del_array_C2.nbytes)
    print("Unification_GPU Kernel 1 running")
    program.Kernel_FBNs_Retained(queue_1, (len_del_array_c1,), None, FBNs_array_buf_C1, NOT_array_buf_C1, size_FBNs_buf_c1, c1_c2_buf_C1, del_array_buf_C1)
    print("Unification_GPU Kernel 1 done")
    print("Unification_GPU Kernel 2 running")
    program.Kernel_FBNs_Retained(queue_2, (len_del_array_c2,), None, FBNs_array_buf_C2, NOT_array_buf_C2, size_FBNs_buf_C2, c1_c2_buf_C2, del_array_buf_C2)
    print("Unification_GPU Kernel 2 done")
    # print("loading outputs from GPU to CPU")
    cl.enqueue_copy(queue_1, del_array_C1, del_array_buf_C1)
    cl.enqueue_copy(queue_2, del_array_C2, del_array_buf_C2)
    # print('Done')
    return (del_array_C1, del_array_C2)
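For context, this is roughly how I call the function. The sizes and value ranges below are made up for illustration only, but the shapes follow the kernel's indexing (each FBN is a flat block of 20*20 = 400 int32 values, and each FBN has 2 int32 NOT values):

import numpy

# Hypothetical inputs (sizes chosen arbitrarily for this example)
n1, n2 = 500, 600
FBNs_C1 = numpy.random.randint(0, 2, size=n1 * 400).astype(numpy.int32)
FBNs_C2 = numpy.random.randint(0, 2, size=n2 * 400).astype(numpy.int32)
NOT_C1 = numpy.random.randint(0, 20, size=n1 * 2).astype(numpy.int32)
NOT_C2 = numpy.random.randint(0, 20, size=n2 * 2).astype(numpy.int32)

del_C1, del_C2 = Unification_GPU_2(FBNs_C1, FBNs_C2, NOT_C1, NOT_C2, n1, n2)
# del_C1 and del_C2 are the per-FBN flag arrays computed on the two GPUs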