Question

我正在尝试使用OpenCL和image2d_t对象来加速图像卷积。当我注意到输出是全零的空白图像时，我将OpenCL内核简化为输入的基本读取并写入输出（如下所示）。通过一些调整，我得到它将图像的一些零散像素写入输出图像。

我已经验证了在OpenCL内核中调用read_imageui（）之前图像是完整的。我使用CommandQueue :: enqueueWriteImage（）将图像写入GPU内存，并立即使用CommandQueue :: enqueueReadImage（）将其读回CPU内存中的全新缓冲区。此调用的结果与原始输入图像匹配。但是，当我在内核中使用read_imageui（）检索像素时，绝大多数像素都设置为0.

C ++来源：

int height = 112;
int width = 9216;
unsigned int numPixels = height * width;
unsigned int numInputBytes = numPixels * sizeof(uint16_t);
unsigned int numDuplicatedInputBytes = numInputBytes * 4;
unsigned int numOutputBytes = numPixels * sizeof(int32_t);

cl::size_t<3> origin;
origin.push_back(0);
origin.push_back(0);
origin.push_back(0);
cl::size_t<3> region;
region.push_back(width);
region.push_back(height);
region.push_back(1);

std::ifstream imageFile("hri_vis_scan.dat", std::ifstream::binary);
checkErr(imageFile.is_open() ? CL_SUCCESS : -1, "hri_vis_scan.dat");
uint16_t *image = new uint16_t[numPixels];
imageFile.read((char *) image, numInputBytes);
imageFile.close();

// duplicate our single channel image into all 4 channels for Image2D
cl_ushort4 *imageDuplicated = new cl_ushort4[numPixels];
for (int i = 0; i < numPixels; i++)
    for (int j = 0; j < 4; j++)
        imageDuplicated[i].s[j] = image[i];

cl::Buffer imageBufferOut(context, CL_MEM_WRITE_ONLY, numOutputBytes, NULL, &err);
checkErr(err, "Buffer::Buffer()");

cl::ImageFormat inFormat;
inFormat.image_channel_data_type = CL_UNSIGNED_INT16;
inFormat.image_channel_order = CL_RGBA;
cl::Image2D bufferIn(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inFormat, width, height, 0, imageDuplicated, &err);
checkErr(err, "Image2D::Image2D()");

cl::ImageFormat outFormat;
outFormat.image_channel_data_type = CL_UNSIGNED_INT16;
outFormat.image_channel_order = CL_RGBA;
cl::Image2D bufferOut(context, CL_MEM_WRITE_ONLY, outFormat, width, height, 0, NULL, &err);
checkErr(err, "Image2D::Image2D()");

int32_t *imageResult = new int32_t[numPixels];
memset(imageResult, 0, numOutputBytes);

cl_int4 *imageResultDuplicated = new cl_int4[numPixels];
for (int i = 0; i < numPixels; i++)
    for (int j = 0; j < 4; j++)
        imageResultDuplicated[i].s[j] = 0;

std::ifstream kernelFile("convolutionKernel.cl");
checkErr(kernelFile.is_open() ? CL_SUCCESS : -1, "convolutionKernel.cl");
std::string imageProg(std::istreambuf_iterator<char>(kernelFile), (std::istreambuf_iterator<char>()));
cl::Program::Sources imageSource(1, std::make_pair(imageProg.c_str(), imageProg.length() + 1));
cl::Program imageProgram(context, imageSource);
err = imageProgram.build(devices, "");
checkErr(err, "Program::build()");

cl::Kernel basic(imageProgram, "basic", &err);
checkErr(err, "Kernel::Kernel()");

basic.setArg(0, bufferIn);
basic.setArg(1, bufferOut);
basic.setArg(2, imageBufferOut);

queue.finish();

cl_ushort4 *imageDuplicatedTest = new cl_ushort4[numPixels];
for (int i = 0; i < numPixels; i++)
{
    imageDuplicatedTest[i].s[0] = 0;
    imageDuplicatedTest[i].s[1] = 0;
    imageDuplicatedTest[i].s[2] = 0;
    imageDuplicatedTest[i].s[3] = 0;
}
double gpuTimer = clock();

err = queue.enqueueReadImage(bufferIn, CL_FALSE, origin, region, 0, 0, imageDuplicatedTest, NULL, NULL);
checkErr(err, "CommandQueue::enqueueReadImage()");

// Output from above matches input image

err = queue.enqueueNDRangeKernel(basic, cl::NullRange, cl::NDRange(height, width), cl::NDRange(1, 1), NULL, NULL);
checkErr(err, "CommandQueue::enqueueNDRangeKernel()");

queue.flush();

err = queue.enqueueReadImage(bufferOut, CL_TRUE, origin, region, 0, 0, imageResultDuplicated, NULL, NULL);
checkErr(err, "CommandQueue::enqueueReadImage()");

queue.flush();

err = queue.enqueueReadBuffer(imageBufferOut, CL_TRUE, 0, numOutputBytes, imageResult, NULL, NULL);
checkErr(err, "CommandQueue::enqueueReadBuffer()");

queue.finish();

OpenCL内核：

__kernel void basic(__read_only image2d_t input, __write_only image2d_t output, __global int *result)
{
const sampler_t smp = CLK_NORMALIZED_COORDS_TRUE | //Natural coordinates
     CLK_ADDRESS_NONE | //Clamp to zeros
     CLK_FILTER_NEAREST; //Don't interpolate

int2 coord = (get_global_id(1), get_global_id(0));

uint4 pixel = read_imageui(input, smp, coord);
result[coord.s0 + coord.s1 * 9216] = pixel.s0;
write_imageui(output, coord, pixel);
}

内核中的坐标当前映射到（x，y）=（宽度，高度）。

输入图像是单通道灰度图像，每像素16位，这就是我必须复制通道以适应OpenCL的Image2D的原因。卷积后的输出将是每像素32位，这就是numOutputBytes设置为的原因。此外，虽然宽度和高度看起来很奇怪，但输入图像的尺寸为9216x7824，因此我只是将其中一部分用于首先测试代码，因此不需要永远。

我从内核中的图像读取后添加了对全局内存的写入，以查看问题是读取图像还是写入图像。内核执行后，这部分全局内存也主要包含零。

非常感谢任何帮助！

Answer 1

read_imageui的文档说明了

此外，采用整数坐标的read_imagei和read_imageui调用必须使用规范化坐标设置为 CLK_NORMALIZED_COORDS_FALSE 且寻址模式设置为 CLK_ADDRESS_CLAMP_TO_EDGE，CLK_ADDRESS_CLAMP或CLK_ADDRESS_NONE 的采样器;否则返回的值是未定义的。

但是你正在创建一个带有CLK_NORMALIZED_COORDS_TRUE的采样器（但似乎是传入非标准化的坐标：S？）。

OpenCL image2d_t主要写零

1 个答案: