Question

我是opencl / pyopencl的新手，我正在尝试了解如何在2D数组上进行基本操作。以下是我尝试在numpy中编写的典型openCL函数的示例：

def fct_np(nx, nz, dx, dz, dt, rhou, rhov, rho):

    tau11, tau22 = np.zeros((nx, nz)), np.zeros((nx, nz))

    Erhou = rhou/rho
    Erhov = rhov/rho

    tau11[1:-1,:] = ( Erhou[2:,:] - Erhou[:-2,:] ) * dx
    tau22[:,1:-1] = ( Erhov[:,2:] - Erhov[:,:-2] ) * dz

    rhou = rhou - dt*tau11
    rhov = rhov - dt*tau22

    return rhou, rhov

和我尝试编写内核

#pragma OPENCL EXTENSION cl_amd_printf: enable
#define PADD (1)

__kernel void fct_cl(       
        const unsigned int nbx, 
        const unsigned int nbz,
        const double dt,
        const double dx,
        const double dz, 

        __global double *rho_cl_in,
        __global double *rhou_cl_in,
        __global double *rhov_cl_in) 
  {
  //Local and global idx
  int gidx = get_global_id(0); 
  int gidz = get_global_id(1); 
  int lidx = get_local_id(0); 
  int lidz = get_local_id(1); 

  //Identification of workgroup
  int i = get_group_id(0); 
  int j = get_group_id(1); 

  int grid_width = get_num_groups(0) * get_local_size(0) ;
  int main_id = gidz * grid_width + gidx; 

  //Local variables
  double tau11 = 0.0 ;
  double tau22 = 0.0 ;
  __local double Erhou[4+2*PADD][4+2*PADD] ;
  __local double Erhov[4+2*PADD][4+2*PADD] ;


    // dE/dx
    Erhou[lidx+PADD][lidz+PADD] = rhou_cl_in[main_id] / rho_cl_in[main_id] ;
    Erhov[lidx+PADD][lidz+PADD] = rhov_cl_in[main_id] / rho_cl_in[main_id] ;

    // Left Edge
    if( i >= 1 && lidx <= PADD-1) {
      Erhou[lidx][lidz+PADD] = rhou_cl_in[ main_id-PADD] / rho_cl_in[ main_id-PADD] ;
      Erhov[lidx][lidz+PADD] = rhov_cl_in[ main_id-PADD] / rho_cl_in[ main_id-PADD] ;
    };

    // Right Edge
    if(i <= (int)get_num_groups(0)-2 && lidx >= 4-PADD ) {
      Erhou[lidx+2*PADD][lidz+PADD] = rhou_cl_in[ main_id+PADD] / rho_cl_in[ main_id+PADD] ;
      Erhov[lidx+2*PADD][lidz+PADD] = rhov_cl_in[ main_id+PADD] / rho_cl_in[ main_id+PADD] ;
     };

    // Bottom Edge
    if( j >= 1 && lidz <= PADD-1) {
      Erhou[lidx+PADD][lidz] = rhou_cl_in[ main_id-PADD*grid_width] / rho_cl_in[ main_id-PADD*grid_width] ;
      Erhov[lidx+PADD][lidz] = rhov_cl_in[ main_id-PADD*grid_width] / rho_cl_in[ main_id-PADD*grid_width] ;
    };

    // Top Edge
    if(j <= (int)get_num_groups(1)-2 && lidz >= 4-PADD ) {
      Erhou[lidx+PADD][lidz+2*PADD] = rhou_cl_in[ main_id+PADD*grid_width]/ rho_cl_in[ main_id+PADD*grid_width] ;
      Erhov[lidx+PADD][lidz+2*PADD] = rhov_cl_in[ main_id+PADD*grid_width]/ rho_cl_in[ main_id+PADD*grid_width] ;
     };

  //Sync
  barrier(CLK_LOCAL_MEM_FENCE);

  tau11 =  0.5 * dx * ( Erhou[lidx+PADD+1][lidz+PADD] - Erhou[lidx+PADD-1][lidz+PADD]) ;
  tau22 =  0.5 * dz * ( Erhov[lidx+PADD][lidz+PADD+1] - Erhov[lidx+PADD][lidz+PADD-1]) ;

  rhou_cl_in[main_id] = rhou_cl_in[main_id] - dt * tau11;   
  rhov_cl_in[main_id] = rhov_cl_in[main_id] - dt * tau22;
  }

我用以下内容调用内核：

import pyopencl as cl
import numpy as np

# Init values
nx, nz = 1024, 256
dx, dz = 1/1e-4, 1/1e-4
dt = 1.5e-7
rho  = np.random.rand(nx, nz)
rhou = np.random.rand(nx, nz)
rhov = np.random.rand(nx, nz)

### Create CL instance
ctx = cl.create_some_context()

queue = cl.CommandQueue(ctx,
       properties=cl.command_queue_properties.PROFILING_ENABLE)  

### Initialize buffers
mf = cl.mem_flags
rhou_cl_in = cl.Buffer(ctx, mf.READ_WRITE |mf.COPY_HOST_PTR, hostbuf=rhou)
rhov_cl_in = cl.Buffer(ctx, mf.READ_WRITE |mf.COPY_HOST_PTR, hostbuf=rhov)
rho_cl_in  = cl.Buffer(ctx, mf.READ_WRITE |mf.COPY_HOST_PTR, hostbuf=rho)

### Load kernel
kernel_code = open('fct_cl.cl', 'r').read()

### build kernel
prg = cl.Program(ctx, kernel_code).build()

### Execute 
global_size = rho.shape
local_size  =  (2**2, 2**2)
ex = prg.fct_cl(queue, global_size, local_size, np.int32(nx), np.int32(nz), np.float64(dt), np.float64(dx), np.float64(dz), rho_cl_in, rhou_cl_in, rhov_cl_in)

### Init outputs
rhou_cl  = np.empty_like(rhou)
rhov_cl  = np.empty_like(rhov)

### Get ouput from GPU -> Host
cl.enqueue_copy(queue, rhou_cl, rhou_cl_in)
cl.enqueue_copy(queue, rhov_cl, rhov_cl_in)

rhou_np, rhov_np = fct_np(nx, nz, dx, dz, dt, rhou, rhov, rho)

print '--------------------------------'
print 'cl & np rhou equal ? ' + `np.array_equal(rhou_np, rhou_cl)`
print 'cl & np rhov equal ? ' + `np.array_equal(rhov_np, rhov_cl)`

Erhou和Erhov的计算是可以的。

问题来自于tau11和tau22的计算。

numpy和opencl的结果非常接近，但似乎opencl中的行和列都被反转了！我认为if文件中cl语句中的索引或域定义存在错误。

有什么想法吗？

使用OpenCL / pyOpenCL对2D阵列进行操作

0 个答案: