我没有成功将以下C函数转换为OpenCL内核。
void Function2(int32_t *input, uint8_t *output, int32_t input_inter[64])
{
int32_t Y[64];
for (int i = 0; i < 64; i++)
Y[i]= input_inter[i];
int32_t k, l;
for (k= 0; k < 8; k++)
{
idct_1d(&Y(k,0));
}
for (l=0; l < 64; l++)
{
printf("c%d: %d ", l, Y[l]);
}
}
我的上述函数的OpenCL内核(不能正常工作)如下:
__kernel void GPU_Function2(__global int *input, __global uchar *output, __global int Yin[64]){
int Y[64];
for (int i= 0; i < 64; i++)
{
Y[i]= Yin[i];
}
unsigned int k= get_global_id(0);
int Yc[8];
if (k < 8)
{
idct_1D(&Y(k,0));
}
for (int r= 0; r < 64; r++)
{
Yin[r]= Y[r];
}
}
}
参数3包含我需要的值,理想情况下,我会通过idct_1D
传递它以获得预期的输出。但是,idct_1D
仅接受私有变量,因此我将Yin
中Function_2
的内容转移到Y[64]
。在那之后,我可以看到,我在Y[64]
上做了idct,然后在Yin[64]
内放回了新值,并期望看到与我的C代码相同的输出。但这没效果。
请建议我如何获得Function_2
的OpenCL代码所需的行为。
-------------------------------------------- ---------
上面的函数从下面的OpenCL内核获得它的第三个输入,它可以正常工作。
__kernel void GPU_Function1(__global int *input, __global uchar *output, __global int Yin[64])
{
int Y[64];
unsigned int lin;
unsigned int k= get_global_id(0);
//unsigned int l= get_global_id(1);
if (k < 8)
{
for (lin=0; lin < 8; lin++)
{
Y(k, lin) = SCALE(input[(k << 3) + lin], S_BITS);
Yin[(k <<3) + lin]= Y(k, lin);
}
}
}
我的Function1
和Function2
的OpenCL主机代码如下:
//Executing Function_1 on the GPU....
ret = clSetKernelArg(Func1_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
ret |= clSetKernelArg(Func1_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
ret |= clSetKernelArg(Func1_kernel, 2, sizeof(cl_mem), (void *)&DCT_Intermediate);
size_t globalForInverseDCT = 8;
ret = clEnqueueNDRangeKernel(command_queue, Func1_kernel, 1, NULL, &globalForInverseDCT, NULL, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, DCT_Intermediate, CL_TRUE, 0, 64 * sizeof(cl_int), &Inter_DCT, 0, NULL, NULL);
printf("----GPU output for Function1 --------\n");
for (int w=0; w < 64; w++)
{
printf("g%d: %d ", w, Inter_DCT[w]);
}
//Executing Function_2 on the GPU......
ret = clSetKernelArg(Func2_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
ret |= clSetKernelArg(Func2_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
ret |= clSetKernelArg(Func2_kernel, 2, sizeof(cl_mem), (void*)&DCT_Intermediate);
size_t InverseDCT_1Dim= 8;
ret = clEnqueueNDRangeKernel(command_queue, Func2_kernel, 1, NULL, &InverseDCT_1Dim, NULL, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, DCT_Intermediate, CL_TRUE, 0, 64 * sizeof(cl_int), &Inter_DCT, 0, NULL, NULL);
printf("-----GPU output for Function 2---------\n");
for (int x= 0; x < 64; x++)
{
printf("g%d: %d ", x, Inter_DCT[x]);
}
exit(1);
////The remaining code.........
Function_1的输出缓冲区的结果如下:
----GPU output for Function1 --------
g0: -1600 g1: 0 g2: 0 g3: 0 g4: 0 g5: 0 g6: 0 g7: 0 g8: -3408 g9: 0 g10: 0 g11: 0 g12: 0 g13: 0 g14: 0 g15: 0 g16: -1960 g17: 0 g18: 0 g19: 0 g20: 0 g21: 0 g22: 0 g23: 0 g24: -336 g25: 0 g26: 0 g27: 0
g28: 0 g29: 0 g30: 0 g31: 0 g32: 648 g33: 0 g34: 0 g35: 0 g36: 0 g37: 0 g38: 0 g39: 0 g40: 672 g41: 0 g42: 0 g43: 0 g44: 0 g45: 0 g46: 0 g47: 0 g48: 200 g49: 0 g50: 0 g51: 0 g52: 0 g53: 0 g54: 0 g55: 0 g56: -288 g57: 0 g58: 0 g59: 0 g60: 0 g61: 0 g62: 0 g63: 0
Function_2
的CORRECT输出(我运行正常的C代码时得到的)是这样的:
c0: -1600 c1: -1600 c2: -1600 c3: -1600 c4: -1600 c5: -1600 c6: -1600 c7: -1600 c8: -3408 c9: -3408 c10: -3408 c11: -3408 c12: -3408 c13: -3408 c14: -3408 c15: -3408 c16: -1960 c17: -1960 c18: -1960 c19: -1960 c20: -1960
c21: -1960 c22: -1960 c23: -1960 c24: -336 c25: -336 c26: -336 c27: -336 c28: -336 c29: -336 c30: -336 c31: -336 c32: 648 c33: 648 c34: 648 c35: 648 c36: 648 c37: 648 c38: 648 c39: 648 c40: 672 c41: 672 c42: 672 c43: 672 c44: 672 c45: 672 c46: 672 c47: 672 c48: 200 c49: 200 c50: 200 c51: 200 c52: 200 c53: 200 c54: 200 c55: 200 c56: -288 c57: -288 c58: -288 c59: -288 c60: -288 c61: -288 c62: -288 c63: -288
但OpenCL等效于Function_2
会产生以下输出,这与C代码不同并且不正确:
-----GPU output for Function 2---------
g0: -1600 g1: 0 g2: 0 g3: 0 g4: 0 g5: 0 g6: 0 g7: 0 g8: -3408 g9: 0 g10: 0 g11: 0 g12: 0 g13: 0 g14: 0 g15: 0 g16: -1960 g17: 0 g18: 0 g19: 0 g20: 0 g21: 0 g22: 0 g23: 0 g24: -336 g25: 0 g26: 0 g27: 0 g28: 0
g29: 0 g30: 0 g31: 0 g32: 648 g33: 0 g34: 0 g35: 0 g36: 0 g37: 0 g38: 0 g39: 0 g40: 672 g41: 0 g42: 0 g43: 0 g44: 0 g45: 0 g46: 0 g47: 0 g48: 200 g49: 0 g50: 0 g51: 0 g52: 0 g53: 0 g54: 0 g55: 0 g56: -288 g57: -288 g58: -288 g59: -288 g60: -288 g61: -288 g62: -288 g63: -288
我的idct_1D
文件中的 .cl
如下所示:
void idct_1D(int *Y)
{
int z1[8], z2[8], z3[8];
but(Y[0], Y[4], z1[1], z1[0]);
rot(1, 6, Y[2], Y[6], &z1[2], &z1[3]);
but(Y[1], Y[7], z1[4], z1[7]);
z1[5] = CMUL(sqrt2, Y[3]);
z1[6] = CMUL(sqrt2, Y[5]);
but(z1[0], z1[3], z2[3], z2[0]);
but(z1[1], z1[2], z2[2], z2[1]);
but(z1[4], z1[6], z2[6], z2[4]);
but(z1[7], z1[5], z2[5], z2[7]);
z3[0] = z2[0];
z3[1] = z2[1];
z3[2] = z2[2];
z3[3] = z2[3];
rot(0, 3, z2[4], z2[7], &z3[4], &z3[7]);
rot(0, 1, z2[5], z2[6], &z3[5], &z3[6]);
but(z3[0], z3[7], Y[7], Y[0]);
but(z3[1], z3[6], Y[6], Y[1]);
but(z3[2], z3[5], Y[5], Y[2]);
but(z3[3], z3[4], Y[4], Y[3]);
}
我将idct.c
.cl
复制到我的mutation do
@doc """
deletes a photo in the database
"""
field :photo_delete, list_of(:photo) do
arg :id, non_null(:id)
resolve &Resolvers.Posts.photo_delete/3
end
end
文件中。