我是opencl / pyopencl的新手,我正在尝试了解如何在2D数组上进行基本操作。以下是我尝试在numpy
中编写的典型openCL
函数的示例:
def fct_np(nx, nz, dx, dz, dt, rhou, rhov, rho):
tau11, tau22 = np.zeros((nx, nz)), np.zeros((nx, nz))
Erhou = rhou/rho
Erhov = rhov/rho
tau11[1:-1,:] = ( Erhou[2:,:] - Erhou[:-2,:] ) * dx
tau22[:,1:-1] = ( Erhov[:,2:] - Erhov[:,:-2] ) * dz
rhou = rhou - dt*tau11
rhov = rhov - dt*tau22
return rhou, rhov
和我尝试编写内核
#pragma OPENCL EXTENSION cl_amd_printf: enable
#define PADD (1)
__kernel void fct_cl(
const unsigned int nbx,
const unsigned int nbz,
const double dt,
const double dx,
const double dz,
__global double *rho_cl_in,
__global double *rhou_cl_in,
__global double *rhov_cl_in)
{
//Local and global idx
int gidx = get_global_id(0);
int gidz = get_global_id(1);
int lidx = get_local_id(0);
int lidz = get_local_id(1);
//Identification of workgroup
int i = get_group_id(0);
int j = get_group_id(1);
int grid_width = get_num_groups(0) * get_local_size(0) ;
int main_id = gidz * grid_width + gidx;
//Local variables
double tau11 = 0.0 ;
double tau22 = 0.0 ;
__local double Erhou[4+2*PADD][4+2*PADD] ;
__local double Erhov[4+2*PADD][4+2*PADD] ;
// dE/dx
Erhou[lidx+PADD][lidz+PADD] = rhou_cl_in[main_id] / rho_cl_in[main_id] ;
Erhov[lidx+PADD][lidz+PADD] = rhov_cl_in[main_id] / rho_cl_in[main_id] ;
// Left Edge
if( i >= 1 && lidx <= PADD-1) {
Erhou[lidx][lidz+PADD] = rhou_cl_in[ main_id-PADD] / rho_cl_in[ main_id-PADD] ;
Erhov[lidx][lidz+PADD] = rhov_cl_in[ main_id-PADD] / rho_cl_in[ main_id-PADD] ;
};
// Right Edge
if(i <= (int)get_num_groups(0)-2 && lidx >= 4-PADD ) {
Erhou[lidx+2*PADD][lidz+PADD] = rhou_cl_in[ main_id+PADD] / rho_cl_in[ main_id+PADD] ;
Erhov[lidx+2*PADD][lidz+PADD] = rhov_cl_in[ main_id+PADD] / rho_cl_in[ main_id+PADD] ;
};
// Bottom Edge
if( j >= 1 && lidz <= PADD-1) {
Erhou[lidx+PADD][lidz] = rhou_cl_in[ main_id-PADD*grid_width] / rho_cl_in[ main_id-PADD*grid_width] ;
Erhov[lidx+PADD][lidz] = rhov_cl_in[ main_id-PADD*grid_width] / rho_cl_in[ main_id-PADD*grid_width] ;
};
// Top Edge
if(j <= (int)get_num_groups(1)-2 && lidz >= 4-PADD ) {
Erhou[lidx+PADD][lidz+2*PADD] = rhou_cl_in[ main_id+PADD*grid_width]/ rho_cl_in[ main_id+PADD*grid_width] ;
Erhov[lidx+PADD][lidz+2*PADD] = rhov_cl_in[ main_id+PADD*grid_width]/ rho_cl_in[ main_id+PADD*grid_width] ;
};
//Sync
barrier(CLK_LOCAL_MEM_FENCE);
tau11 = 0.5 * dx * ( Erhou[lidx+PADD+1][lidz+PADD] - Erhou[lidx+PADD-1][lidz+PADD]) ;
tau22 = 0.5 * dz * ( Erhov[lidx+PADD][lidz+PADD+1] - Erhov[lidx+PADD][lidz+PADD-1]) ;
rhou_cl_in[main_id] = rhou_cl_in[main_id] - dt * tau11;
rhov_cl_in[main_id] = rhov_cl_in[main_id] - dt * tau22;
}
我用以下内容调用内核:
import pyopencl as cl
import numpy as np
# Init values
nx, nz = 1024, 256
dx, dz = 1/1e-4, 1/1e-4
dt = 1.5e-7
rho = np.random.rand(nx, nz)
rhou = np.random.rand(nx, nz)
rhov = np.random.rand(nx, nz)
### Create CL instance
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx,
properties=cl.command_queue_properties.PROFILING_ENABLE)
### Initialize buffers
mf = cl.mem_flags
rhou_cl_in = cl.Buffer(ctx, mf.READ_WRITE |mf.COPY_HOST_PTR, hostbuf=rhou)
rhov_cl_in = cl.Buffer(ctx, mf.READ_WRITE |mf.COPY_HOST_PTR, hostbuf=rhov)
rho_cl_in = cl.Buffer(ctx, mf.READ_WRITE |mf.COPY_HOST_PTR, hostbuf=rho)
### Load kernel
kernel_code = open('fct_cl.cl', 'r').read()
### build kernel
prg = cl.Program(ctx, kernel_code).build()
### Execute
global_size = rho.shape
local_size = (2**2, 2**2)
ex = prg.fct_cl(queue, global_size, local_size, np.int32(nx), np.int32(nz), np.float64(dt), np.float64(dx), np.float64(dz), rho_cl_in, rhou_cl_in, rhov_cl_in)
### Init outputs
rhou_cl = np.empty_like(rhou)
rhov_cl = np.empty_like(rhov)
### Get ouput from GPU -> Host
cl.enqueue_copy(queue, rhou_cl, rhou_cl_in)
cl.enqueue_copy(queue, rhov_cl, rhov_cl_in)
rhou_np, rhov_np = fct_np(nx, nz, dx, dz, dt, rhou, rhov, rho)
print '--------------------------------'
print 'cl & np rhou equal ? ' + `np.array_equal(rhou_np, rhou_cl)`
print 'cl & np rhov equal ? ' + `np.array_equal(rhov_np, rhov_cl)`
Erhou
和Erhov
的计算是可以的。
问题来自于tau11
和tau22
的计算。
numpy
和opencl
的结果非常接近,但似乎opencl
中的行和列都被反转了!
我认为if
文件中cl
语句中的索引或域定义存在错误。
有什么想法吗?