Question

我想使用GPU内核在卷上执行双阈值。我将每个切片的音量发送为read_only image2d_t。我的输出音量是二进制音量，其中每个位指定是否启用或禁用其相关的体素。我的内核检查当前像素值是否在下限/上限阈值范围内，并在二进制卷中启用其对应的位。

出于调试目的，我暂时留下了实际检查。我只是使用传递的切片nr来确定二进制卷位是应该打开还是关闭。前14个切片设置为＆＃34; on＆＃34;其余为＆＃34; off＆＃34;。我还在CPU端验证了这段代码，我在这篇文章的底部粘贴了代码。代码显示了两个路径，现在正在评论CPU。

CPU代码按预期工作，在应用二进制掩码渲染卷后返回以下图像：

Rendering with a correct computed mask

使用我的GPU内核运行完全相同的逻辑会返回不正确的结果（第1张3D，第2张切片视图）：

Rendering with an incorrect GPU computed mask

Rendering with an incorrect GPU computed mask (sliceview)

这里出了什么问题？我读到OpenCL不支持位字段，但它确实支持按位运算符，据我所知，从OpenCL规范。我的位逻辑，从32位字中选择正确的位并翻转它，是否支持？或者我的简单旗帜被认为是一个领域。它的作用是从左边选择体素％32位（不是右边，因此减去）。

另一件事可能是传递给我的内核的uint指针与我的期望不同。我假设这将有效地使用指针并将数据传递给我的内核。应用于＆＃34; uint * word＆＃34;内核中的部分是由于每行填充字和每个片的填充行。 CPU变体确认指针计算逻辑有效。

下面;代码

            uint wordsPerRow = (uint)BinaryVolumeWordsPerRow(volume.Geometry.NumberOfVoxels);
            uint wordsPerPlane = (uint)BinaryVolumeWordsPerPlane(volume.Geometry.NumberOfVoxels);

            int[] dims = new int[3];
            dims[0] = volume.Geometry.NumberOfVoxels.X;
            dims[1] = volume.Geometry.NumberOfVoxels.Y;
            dims[2] = volume.Geometry.NumberOfVoxels.Z;

            uint[] arrC = dstVolume.BinaryData.ObtainArray() as uint[];
            unsafe {
                fixed(int* dimPtr = dims) {
                    fixed(uint *arrcPtr = arrC) {
                        // pick Cloo Platform
                        ComputePlatform platform = ComputePlatform.Platforms[0];

                        // create context with all gpu devices
                        ComputeContext context = new ComputeContext(ComputeDeviceTypes.Gpu,
                            new ComputeContextPropertyList(platform), null, IntPtr.Zero);

                        // load opencl source
                        StreamReader streamReader = new StreamReader(@"C:\views\pii-sw113v1\PMX\ADE\Philips\PmsMip\Private\Viewing\Base\BinaryVolumes\kernels\kernel.cl");
                        string clSource = streamReader.ReadToEnd();
                        streamReader.Close();

                        // create program with opencl source
                        ComputeProgram program = new ComputeProgram(context, clSource);

                        // compile opencl source
                        program.Build(null, null, null, IntPtr.Zero);

                        // Create the event wait list. An event list is not really needed for this example but it is important to see how it works.
                        // Note that events (like everything else) consume OpenCL resources and creating a lot of them may slow down execution.
                        // For this reason their use should be avoided if possible.
                        ComputeEventList eventList = new ComputeEventList();

                        // Create the command queue. This is used to control kernel execution and manage read/write/copy operations.
                        ComputeCommandQueue commands = new ComputeCommandQueue(context, context.Devices[0], ComputeCommandQueueFlags.None);

                        // Create the kernel function and set its arguments.
                        ComputeKernel kernel = program.CreateKernel("LowerThreshold");

                        int slicenr = 0;
                        foreach (IntPtr ptr in pinnedSlices) {
                            /*// CPU VARIANT FOR TESTING PURPOSES 
                            for (int y = 0; y < dims[1]; y++) {
                                for (int x = 0; x < dims[0]; x++) {
                                    long pixelOffset = x + y * dims[0];
                                    ushort* ushortPtr = (ushort*)ptr;
                                    ushort pixel = *(ushortPtr + pixelOffset);

                                    int BinaryWordShift = 5;
                                    int BinaryWordBits = 32;
                                    if (
                                        (0 <= x) &&
                                        (0 <= y) &&
                                        (0 <= slicenr) &&
                                        (x < dims[0]) &&
                                        (y < dims[1]) &&
                                        (slicenr < dims[2])
                                    ) {
                                        uint* word =
                                            arrcPtr + 1 + (slicenr * wordsPerPlane) +
                                            (y * wordsPerRow) +
                                            (x >> BinaryWordShift);
                                        uint mask = (uint)(0x1 << ((BinaryWordBits - 1) - (byte)(x & 0x1f)));
                                        //if (pixel > lowerThreshold && pixel < upperThreshold) {
                                        if (slicenr < 15) {
                                            *word |= mask;
                                        } else {
                                            *word &= ~mask;
                                        }
                                    }
                                }
                            }*/

                            ComputeBuffer<int> dimsBuffer = new ComputeBuffer<int>(
                                context,
                                ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer,
                                3,
                                new IntPtr(dimPtr));

                            ComputeImageFormat format = new ComputeImageFormat(ComputeImageChannelOrder.Intensity, ComputeImageChannelType.UnsignedInt16);
                            ComputeImage2D image2D = new ComputeImage2D(
                                context, 
                                ComputeMemoryFlags.ReadOnly, 
                                format, 
                                volume.Geometry.NumberOfVoxels.X, 
                                volume.Geometry.NumberOfVoxels.Y, 
                                0, 
                                ptr
                            );

                            // The output buffer doesn't need any data from the host. Only its size is specified (arrC.Length).
                            ComputeBuffer<uint> c = new ComputeBuffer<uint>(
                                context, ComputeMemoryFlags.WriteOnly, arrC.Length);

                            kernel.SetMemoryArgument(0, image2D);
                            kernel.SetMemoryArgument(1, dimsBuffer);
                            kernel.SetValueArgument(2, wordsPerRow);
                            kernel.SetValueArgument(3, wordsPerPlane);
                            kernel.SetValueArgument(4, slicenr);
                            kernel.SetValueArgument(5, lowerThreshold);
                            kernel.SetValueArgument(6, upperThreshold);
                            kernel.SetMemoryArgument(7, c);

                            // Execute the kernel "count" times. After this call returns, "eventList" will contain an event associated with this command.
                            // If eventList == null or typeof(eventList) == ReadOnlyCollection<ComputeEventBase>, a new event will not be created.
                            commands.Execute(kernel, null, new long[] { dims[0], dims[1] }, null, eventList);

                            // Read back the results. If the command-queue has out-of-order execution enabled (default is off), ReadFromBuffer 
                            // will not execute until any previous events in eventList (in our case only eventList[0]) are marked as complete 
                            // by OpenCL. By default the command-queue will execute the commands in the same order as they are issued from the host.
                            // eventList will contain two events after this method returns.
                            commands.ReadFromBuffer(c, ref arrC, false, eventList);

                            // A blocking "ReadFromBuffer" (if 3rd argument is true) will wait for itself and any previous commands
                            // in the command queue or eventList to finish execution. Otherwise an explicit wait for all the opencl commands 
                            // to finish has to be issued before "arrC" can be used. 
                            // This explicit synchronization can be achieved in two ways:
                            // 1) Wait for the events in the list to finish,
                            //eventList.Wait();
                            //}
                            // 2) Or simply use
                            commands.Finish();

                            slicenr++;
                        }

                    }
                }
            }

我的内核代码：

const sampler_t smp = CLK_FILTER_NEAREST | CLK_ADDRESS_CLAMP |   CLK_NORMALIZED_COORDS_FALSE;
kernel void LowerThreshold(
    read_only image2d_t image,
    global int* brickSize,
    uint wordsPerRow,
    uint wordsPerPlane,
    int slicenr,
    int lower,
    int upper,
    global write_only uint* c )
{

    int4 coord = (int4)(get_global_id(0),get_global_id(1),slicenr,1);
    uint4 pixel = read_imageui(image, smp, coord.xy);

    uchar BinaryWordShift = 5;
    int BinaryWordBits = 32;
    if (
            (0 <= coord.x) &&
            (0 <= coord.y) &&
            (0 <= coord.z) &&
            (coord.x < brickSize[0]) &&
            (coord.y < brickSize[1]) &&
            (coord.z < brickSize[2])
    ) {
        global uint* word =
            c + 1 + (coord.z * wordsPerPlane) +
            (coord.y * wordsPerRow) +
            (coord.x >> BinaryWordShift);

        uint mask = (uint)(0x1 << ((BinaryWordBits - 1) - (uchar)(coord.x & 0x1f)));
        //if (pixel.w > lower && pixel.w < upper) {
        if (slicenr < 15) {
            *word |= mask;
        } else {
            *word &= ~mask;
        }
    }
}

Answer 1

两个问题：

您已将“c”声明为“write_only”但仍使用“| =”和“＆amp; =”运算符，这些运算符为read-modify-write
正如其他海报所提到的，如果两个工作项正在访问同一个单词，那么read-modify-write之间会存在导致错误的竞争条件。原子操作比非原子操作慢得多，因此尽管可能，但不建议使用。

我建议你的输出大8倍，使用字节而不是位。这将使您的输出只写，并且还会消除争用，从而消除竞争条件。

或（如果数据紧凑性或格式很重要）每个工作项一次处理8个元素，并将复合8位输出写为单个字节。这将是只写的，没有争用，并且仍然具有数据紧凑性。

GPU内核中的指针和位运算符

1 个答案: