对于IDCT，我有以下四个C函数（这要感谢我在上一个问题中得到的帮助）。我希望把这些函数全部转换为OpenCL内核（并在GPU上运行），但就连第一个函数，我也很难得到预期的结果。
函数1：
/*
 * Stage 1 of the split IDCT: scale the 64 input coefficients.
 *
 * For every (row, col) in an 8x8 grid, input[(row << 3) + col] is passed
 * through SCALE(..., S_BITS), stored through the Y(row, col) macro and then
 * copied to input_inter at the same flat index.
 *
 * input       - coefficients, read at indices 0..63.
 * output      - not used by this stage.
 * input_inter - receives the 64 scaled values.
 *
 * NOTE(review): Y(row, col) presumably addresses the local array Y below --
 * confirm against the macro definition. Y and Yc look redundant here, but
 * other stages in this file declare their own local Y and read it without
 * writing it first, so they may be relying on whatever this frame leaves
 * behind on the stack. Confirm before removing either local.
 */
void IDCTforX(int32_t *input, uint8_t *output, int32_t input_inter[64])
{
	int32_t Y[64];
	int32_t row, col;
	int32_t Yc[8];

	row = 0;
	while (row < 8) {
		col = 0;
		while (col < 8) {
			Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
			input_inter[(row << 3) + col] = Y(row, col);
			col++;
		}
		row++;
	}
}
函数2：
/*
 * Stage 2 of the split IDCT: hand each 8-element row of the intermediate
 * block to idct_1d().
 *
 * input       - not used by this stage.
 * output      - not used by this stage.
 * input_inter - 64 values laid out as 8 consecutive rows of 8; the address
 *               of each row's first element is passed to idct_1d().
 *               idct_1d() is assumed to transform the 8 values it is given
 *               in place -- confirm against its definition.
 *
 * This stage used to re-point input_inter at an uninitialised local array
 * before the loop. That made idct_1d() run on indeterminate values and left
 * the caller's buffer untouched, so data produced by the previous stage
 * (on the CPU or read back from an OpenCL buffer) was silently ignored.
 * The rows of the caller's buffer are now transformed directly.
 *
 * NOTE(review): a later stage that reads its own local Y instead of this
 * buffer will not see these results; the intermediate block has to be
 * passed along explicitly.
 */
void IDCTfor1dim(int32_t *input, uint8_t *output, int32_t input_inter[64])
{
	int32_t k;

	for (k = 0; k < 8; k++) {
		/* Row k starts at flat index 8 * k. */
		idct_1d(&input_inter[k << 3]);
	}
}
函数3：
/*
 * Stage 3 of the split IDCT: gather the block column by column into Yc,
 * run idct_1d() on each gathered column, and report the first value of
 * each transformed column through output_inter.
 *
 * input        - not used by this stage.
 * output       - not used by this stage.
 * output_inter - 8 slots; slot l receives Yc[l][0] after column l has been
 *                passed to idct_1d().
 *
 * Fixes relative to the previous revision:
 *  - output_inter was indexed with k after k's loop had finished (k == 8),
 *    so every store landed one element past the 8-element array. It is now
 *    indexed with the live loop variable l.
 *  - A separate loop copied Y(k, l) into output_inter using l after l's
 *    loop had finished (l == 8). Its index was stale and every slot it
 *    wrote is overwritten by the loop below, so it has been removed.
 *
 * NOTE(review): Y is never written inside this function, so the values
 * gathered into Yc are indeterminate. The row-transformed block from the
 * previous stage needs to be passed in, which requires a signature change;
 * that is out of scope for this fix. Y(k, l) presumably addresses the local
 * array Y -- confirm against the macro definition.
 */
void IDCTforY(int32_t *input, uint8_t *output, int32_t output_inter[8])
{
	int32_t Y[64];
	int32_t k, l;
	int32_t Yc[8][8];

	/* Transpose: row l of Yc holds column l of the block. */
	for (l = 0; l < 8; l++) {
		for (k = 0; k < 8; k++) {
			Yc[l][k] = Y(k, l);
		}
	}

	for (l = 0; l < 8; l++) {
		idct_1d(Yc[l]);
		output_inter[l] = Yc[l][0];
	}
}
函数4：
/*
 * Stage 4 of the split IDCT: de-scale, add 128, clamp to 0..255 and store
 * the result through the X(k, l) macro.
 *
 * input        - not used by this stage.
 * output       - not referenced by name in this body; X(k, l) presumably
 *                writes into it -- confirm against the macro definition.
 * output_inter - 8 slots, overwritten by the first loop nest below.
 *
 * NOTE(review): Yc is declared here but never written before it is read,
 * so both loop nests operate on indeterminate values. The transformed
 * columns from the previous stage are not reachable through this
 * signature (output_inter holds only 8 values, the block has 64).
 * Y is declared but never used.
 */
void IDCT_Descale(int32_t *input, uint8_t *output, int32_t output_inter[8]) {
int32_t Y[64];
int32_t k, l;
int32_t Yc[8][8];
// Copy pass, kept separate from the de-scaling pass below. Every value of l
// rewrites all 8 slots of output_inter, so only the l == 7 row survives.
for (l = 0; l < 8; l++)
{
for (k = 0; k < 8; k++)
{
output_inter[k]= Yc[l][k];
}
}
// De-scaling pass over all 64 entries of Yc.
for (l = 0; l < 8; l++)
{
for (k = 0; k < 8; k++)
{
// De-scale by S_BITS + 3 and shift the result up by 128.
int32_t r = 128 + DESCALE(Yc[l][k], S_BITS + 3);
// Clamp: r <= 0 becomes 0, r >= 255 becomes 255.
r = r > 0 ? (r < 255 ? r : 255) : 0;
// Note the index order: Yc is read as [l][k] but stored as X(k, l).
X(k, l) = r;
}
}
}
对于函数1和2，我增加了第三个参数input_inter[64]，用来保存被修改后的数据，以便中间结果可以从一个函数传递到下一个函数。
我对函数1的OpenCL代码如下所示:
/*
 * OpenCL version of the scaling stage: one work-item per row.
 *
 * The work-item whose global id (dimension 0) is `row` scales the eight
 * coefficients input[(row << 3) + 0..7] with SCALE(..., S_BITS), stores each
 * through the Y(row, col) macro and copies it to Yin at the same flat index.
 * Work-items with a global id of 8 or more do nothing.
 *
 * input  - coefficients, read at indices 0..63.
 * output - not used by this kernel.
 * Yin    - receives the 64 scaled values.
 *
 * NOTE(review): Y(row, col) presumably addresses the per-work-item array Y
 * declared below -- confirm against the macro definition.
 */
__kernel void IDCTForX(__global int *input, __global uchar *output, __global int Yin[64])
{
	int Y[64];
	unsigned int col;
	unsigned int row = get_global_id(0);

	/* Nothing to do outside the 8 rows of the block. */
	if (row >= 8)
		return;

	for (col = 0; col != 8; ++col) {
		Y(row, col) = SCALE(input[(row << 3) + col], S_BITS);
		Yin[(row << 3) + col] = Y(row, col);
	}
}
注意:我已将程序简化为仅关注与我的问题相关的部分。
我在.c代码中按如下方式调用此内核：
/*
 * Host-side fragment: runs the IDCTForX kernel on the device for every
 * block, reads the 64 intermediate values back, then runs the remaining
 * three stages on the CPU.
 *
 * NOTE(review): the fragment is incomplete as shown -- the outermost
 * index_X loop is not closed, and the creation of program, context,
 * command_queue, DCT_Input and DCT_Output is not visible here.
 */
cl_uchar *YCbCr_MCU_ds[3] = { NULL, NULL, NULL};
cl_int unZZ_MCU[64];
cl_int idct_out[8];
// Host copy of the intermediate block (third kernel argument).
cl_int Inter_DCT[64];
// NOTE(review): ret from clCreateKernel is overwritten by clCreateBuffer
// below without being checked.
cosX_kernel= clCreateKernel(program, "IDCTForX", &ret);
// Device buffer for the 64 intermediate values; no host data is supplied.
cl_mem DCT_Intermediate = clCreateBuffer(context, CL_MEM_READ_WRITE, 64 * sizeof(cl_int), NULL, &ret);
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
for (index = 0; index < SOS_section.n; index++) {
int component_index = component_order[index];
// Block count = high nibble of HV times low nibble of HV.
int nb_MCU = ((SOF_component[component_index].HV>> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);
for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++){
//unpack_block and iqzz...
// NOTE(review): nothing visible here copies unZZ_MCU into DCT_Input before
// the kernel runs -- confirm the elided code uploads it.
/* Set OpenCL kernel arguments */
ret = clSetKernelArg(cosX_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
ret |= clSetKernelArg(cosX_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);
ret |= clSetKernelArg(cosX_kernel, 2, sizeof(cl_mem), (void *)&DCT_Intermediate);
// One-dimensional launch with 8 work-items.
size_t globalForInverseDCT = 8;
// NOTE(review): the OR-ed clSetKernelArg status in ret is overwritten here,
// and the two enqueue results below are never inspected either.
ret = clEnqueueNDRangeKernel(command_queue, cosX_kernel, 1, NULL, &globalForInverseDCT, NULL, 0, NULL, NULL);
// Blocking read (CL_TRUE) of all 64 intermediate values into Inter_DCT.
ret = clEnqueueReadBuffer(command_queue, DCT_Intermediate, CL_TRUE, 0, 64 * sizeof(cl_int), &Inter_DCT, 0, NULL, NULL);
// Remaining stages run on the CPU. Only IDCTfor1dim is handed Inter_DCT;
// the last two calls receive idct_out instead.
IDCTfor1dim(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), Inter_DCT);
IDCTforY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_out);
IDCT_Descale(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_out);
}
//Calling the up-sampling function........
}
//Color-conversion......
screen_cpyrect();
}
我希望这段代码能给出正确的输出，但结果并不正确。我调试了代码，发现OpenCL内核IDCTForX中第三个参数的变化与常规C版本的IDCTforX完全相同。
在我的C代码中，我按常规方式调用这些函数，输出是正确的：
/*
 * CPU reference path: for every (index_X, index_Y) position and every
 * component listed in component_order, run the four IDCT stages on each
 * block, then upsample; colour conversion and the screen copy run once per
 * (index_X, index_Y) position.
 */
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
	for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
		for (index = 0; index < SOS_section.n; index++) {
			uint32_t component_index = component_order[index];
			/* Block count = high nibble of HV times low nibble of HV. */
			int hv_high = (SOF_component[component_index].HV >> 4) & 0xf;
			int hv_low = SOF_component[component_index].HV & 0x0f;
			int nb_MCU = hv_high * hv_low;

			chroma_ss = 0;
			while (chroma_ss < nb_MCU) {
				//unpack block and iqzz...
				/* Each block occupies 64 entries of the component's plane. */
				IDCTforX(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_intermediate);
				IDCTfor1dim(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_intermediate);
				IDCTforY(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_out);
				IDCT_Descale(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), idct_out);
				chroma_ss++;
			}
			upsampler();
		}
		YCbCr_to_ARGB();
		screen_cpyrect();
	}
}
为什么我的OpenCL代码的行为与C代码不同？我的IDCTForX内核代码有什么问题？