我有一个简单的 OpenCL 内核，保存在文件 Test_Kernel.cl 中，用于从图像中读取数据：
/* Program-scope sampler passed to read_imagef() in read():
 *   CLK_NORMALIZED_COORDS_FALSE - coordinates are given unnormalized,
 *   CLK_ADDRESS_CLAMP           - clamp addressing mode for out-of-range coordinates,
 *   CLK_FILTER_NEAREST          - nearest-neighbour filtering (no interpolation).
 */
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
/* Read one texel of `image` at this work-item's global position.
 *
 * Global id dimension 0 is used as z, dimension 1 as y and dimension 2 as x.
 * The first channel of the fetched texel is stored in a local variable and
 * not used any further; nothing is written back or returned.
 */
void read(image3d_t image)
{
    const int gz = get_global_id(0);
    const int gy = get_global_id(1);
    const int gx = get_global_id(2);

    /* The fourth coordinate component is set to 0, as in the original call. */
    const float4 coord = (float4)(gx, gy, gz, 0);
    const float4 texel = read_imagef(image, sampler, coord);

    float first_channel = texel.s0;
}
/* Kernel entry point: every work-item forwards the 3D image argument to
 * read(), which performs a single image fetch for that work-item.
 *
 * d_testdata - the 3D image to read from.
 */
__kernel void test(image3d_t d_testdata)
{
read(d_testdata);
}
以及相应的 pyopencl 脚本，它把一个 numpy 数组复制到设备上，以便内核能够从中读取：
import pyopencl as cl
import numpy as np
# Set up the platform, device, context and command queues.
platform = cl.get_platforms()[0]
devs = platform.get_devices()
device1 = devs[0]
ctx = cl.Context([device1])
# Two separate command queues on the same context: f() enqueues the
# host-to-device copy on `queue` and the kernel launch on `queue2`.
queue = cl.CommandQueue(ctx)
queue2 = cl.CommandQueue(ctx)

# Define the test data: 8192 consecutive values as a C-ordered float32
# array of shape (2, 64, 64).
h_testdata = np.arange(4096*2).reshape((2,64,64)).astype(np.float32,order='C')
mf = cl.mem_flags

# Build the kernel. The source file is read inside a `with` block so the
# file handle is closed as soon as its contents are in memory.
with open('Test_Kernel.cl', 'r') as kernel_file:
    fstr = kernel_file.read()
prg = cl.Program(ctx, fstr).build()
test_knl = prg.test
def f():
    """Upload ``h_testdata`` into a 3D image and launch the test kernel on it.

    The copy is enqueued non-blocking on ``queue``; the kernel is enqueued on
    ``queue2`` and waits on the copy's event. The function returns ``None``
    and does not wait for the kernel to finish.
    """
    # OpenCL image extents and copy regions are (width, height, depth), with
    # width varying fastest in host memory. A C-ordered NumPy array has shape
    # (depth, height, width), so the NumPy shape has to be reversed.
    img_shape = h_testdata.shape[::-1]
    img_format = cl.ImageFormat(cl.channel_order.INTENSITY, cl.channel_type.FLOAT)
    d_testdata = cl.Image(ctx, mf.READ_ONLY, img_format, img_shape)

    wev1 = cl.enqueue_copy(queue, d_testdata, h_testdata, is_blocking=False,
                           origin=(0, 0, 0), region=img_shape)
    # An event may only be waited on from a *different* command queue once the
    # queue that produced it has been flushed (or a blocking call has implied
    # a flush). Submit the copy explicitly before the kernel depends on it.
    queue.flush()

    test_knl.set_args(d_testdata)
    # NOTE(review): the kernel uses global id 0 as z, but the global size is
    # 64 in that dimension while the data has only 2 slices -- confirm that
    # (64, 64, 64) is intended rather than h_testdata.shape.
    cl.enqueue_nd_range_kernel(queue2, test_knl, (64, 64, 64), None, wait_for=[wev1])


f()
由于程序流程会变得越来越复杂，我使用事件和多个队列来控制执行顺序和同步。但是，我在分析器中看到，内核有时会在复制事件「之前」就执行。我做错了什么？