我正在尝试使用CUDA在2D中实现Navier-Stokes求解器。我正在使用雅可比的方法来解决差分方程组。我将代码划分为由16x16线程组成的4x4块。由于我的矩阵(尺寸为64x64)中的每个内点都需要其顶部,底部,左侧和右侧元素来计算其新值,因此我为每个块创建了一个18x18维度的新共享矩阵。我以这种方式将所有值读入矩阵 - 带索引(0,0)的线程将其值写入矩阵中的(1,1)元素,并且还将尝试读取其上方的元素和如果此访问权限未超过边界,则为左侧。读取完成后,我会更新所有内部点的值,然后将它们写回内存。
我最终在矩阵pn
中获取了垃圾值,即使所有值都已正确初始化。老实说,我看不出我哪里出错了。有人可以帮我这个吗?
我的内核 -
__global__ void red_psi (float *psi_o, float *psi_n, float *e, float *omega, float l1)
{
// m = n = 64
int i1 = blockIdx.x;
int j1 = blockIdx.y;
int i2 = threadIdx.x;
int j2 = threadIdx.y;
int i = (i1 * blockDim.x) + i2; // Actual row of the element
int j = (j1 * blockDim.y) + j2; // Actual column of the element
int l = i * n + j;
// e_XX --> variables refers to expanded shared memory location in order to accomodate halo elements
//Current Local ID with radius offset.
int e_li = i2 + 1;
int e_lj = j2 + 1;
// Variable pointing at top and bottom neighbouring location
int e_li_prev = e_li - 1;
int e_li_next = e_li + 1;
// Variable pointing at left and right neighbouring location
int e_lj_prev = e_lj - 1;
int e_lj_next = e_lj + 1;
__shared__ float po[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
__shared__ float pn[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
__shared__ float oo[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
//__shared__ float ee[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
if (i2 < 1) // copy top and bottom halo
{
//Copy Top Halo Element
if (blockIdx.y > 0) // Boundary check
{
po[i2][e_lj] = psi_o[l - n];
//pn[i2][e_lj] = psi_n[l - n];
oo[i2][e_lj] = omega[l - n];
//printf ("i_pn[%d][%d] = %f\n", i2, e_lj, oo[i2][e_lj]);
}
//Copy Bottom Halo Element
if (blockIdx.y < (gridDim.y - 1)) // Boundary check
{
po[1 + BLOCK_SIZE][e_lj] = psi_o[l + n];
//pn[1 + BLOCK_SIZE][e_lj] = psi_n[l + n];
oo[1 + BLOCK_SIZE][e_lj] = omega[l + n];
//printf ("j_pn[%d][%d] = %f\n", 1 + BLOCK_SIZE, e_lj, oo[1 + BLOCK_SIZE][e_lj]);
}
}
if (j2 < 1) // copy left and right halo
{
if (blockIdx.x > 0) // Boundary check
{
po[e_li][j2] = psi_o[l - 1];
//pn[e_li][j2] = psi_n[l - 1];
oo[e_li][j2] = omega[l - 1];
//printf ("k_pn[%d][%d] = %f\n", e_li, j2, oo[e_li][j2]);
}
if (blockIdx.x < (gridDim.x - 1)) // Boundary check
{
po[e_li][1 + BLOCK_SIZE] = psi_o[l + 1];
//pn[e_li][1 + BLOCK_SIZE] = psi_n[l + 1];
oo[e_li][1 + BLOCK_SIZE] = omega[l + 1];
//printf ("l_pn[%d][%d] = %f\n", e_li, 1 + BLOCK_SIZE, oo[e_li][BLOCK_SIZE + 1]);
}
}
// copy current location
po[e_li][e_lj] = psi_o[l];
//pn[e_li][e_lj] = psi_n[l];
oo[e_li][e_lj] = omega[l];
//printf ("o_pn[%d][%d] = %f\n", e_li, e_lj, oo[e_li][e_lj]);
__syncthreads ();
// Checking whether we have an internal point.
if ((i >= 1 && i < (m - 1)) && (j >= 1 && j < (n - 1)))
{
//printf ("Calculating for - (%d, %d)\n", i, j);
pn[e_li][e_lj] = 0.25 * (po[e_li_next][e_lj] + po[e_li_prev][e_lj] + po[e_li][e_lj_next] + po[e_li][e_lj_prev] + h*h*oo[e_li][e_lj]);
//printf ("n_pn[%d][%d] (%d, %d), a(%d, %d) = %f\n", e_li_prev, e_lj, i1, j1, i, j, po[e_li_prev][e_lj]);
pn[e_li][e_lj] = po[e_li][e_lj] + 1.0 * (pn[e_li][e_lj] - po[e_li][e_lj]);
__syncthreads ();
psi_n[l] = pn[e_li][e_lj];
e[l] = po[e_li][e_lj] - pn[e_li][e_lj];
}
}
这就是我调用内核的方法 -
dim3 threadsPerBlock (4, 4);
dim3 numBlocks (4, 4);
red_psi<<<numBlocks, threadsPerBlock>>> (d_xn, d_xx, d_e, d_w, l1);
(d_xx, d_xn, d_e, d_w
是所有float
数组,大小为4096)
答案 0 :(得分:1)
当我复制顶部/底部和左/右光环元素时,我切换了blockDim.x和blockDim.y。