Question

我有一些与此类似的代码：

// add the new point built from X and Y to the collection
Wave.Points.Add(new Point(X, Y));
// remove the first value (index 0), so the collection is used like a queue
Wave.Points.RemoveAt(0);

我想把这段代码移植到 OpenCL 内核中，应该如何以最高效的方式实现队列的功能？

目前，我已经使用固定大小的数组实现了它，该数组监视队列中元素的头部和数量。我想知道是否有更优雅的解决方案。

谢谢

Answer 1

您可以了解一下 SYCL（https://www.khronos.org/sycl）：在 SYCL 中，C++ 模板函数可以同时包含主机代码和设备代码，从而构建使用 OpenCL 加速的复杂算法。如果您无法使用它，甚至没有支持 OpenCL 2.0 的设备：

OpenCL 1.2 中没有任何等效的内容。但您在评论中说：

我不确定是否理解了你的意思，但这需要在 GPU 上的每个线程中各自独立

那么就不需要进行线程间通信了，用循环缓冲区应该很容易实现一个在缓存、内存和计算上都高效的 FIFO；只是注意不要让它上溢（最多 64 个元素），也不要让它下溢（弹出次数多于推入次数）。边界检查也很容易实现，但会带来性能损失：

推入（push）

   // Appends `value` to the FIFO kept in the private array `stack`.
   //
   // Layout used by this function: stack[0] is the running count of pushes,
   // and the data slots start at stack[2] (stack[1] is the second header
   // word). The ring has 64 data slots, so `stack` needs at least 66 words.
   //
   // Returns true unconditionally; there is no overflow check.
   bool push(__private uint * stack, uint value)
    {
        // Number of data slots in the ring.
        const uint capacity = 64;

        // Snapshot of the push counter before this call.
        const uint pushCount = stack[0];

        // Counter modulo capacity selects the slot; the +2 skips the two
        // header words. When more than `capacity` values are pending this
        // overwrites the oldest ones, one by one.
        stack[2 + pushCount % capacity] = value;

        // The counter keeps incrementing even after the ring has wrapped.
        stack[0] = pushCount + 1;

        return true;
    }

检查是否为空

        // Reports whether the queue in `stack` currently holds no values:
        // true exactly when the two header counters, stack[0] and stack[1],
        // are equal.
        bool empty(__private uint * stack)
        {
            const uint pushCount = stack[0];
            const uint popCount = stack[1];

            // NOTE(review): the original author warns this is "tricky if you
            // overflow both" - presumably the result is unreliable once the
            // ring has been overfilled; confirm before relying on it.
            return pushCount == popCount;
        }

队首值（front）

        // Returns the oldest value in the queue without removing it.
        // stack[1] (the pop counter) modulo the 64-slot capacity selects the
        // slot; data slots start at stack[2]. Nothing in `stack` is modified.
        uint front(__private uint * stack)
        {
            const uint capacity = 64;
            const uint popCount = stack[1];

            // +2 skips the two header words in front of the ring data.
            return stack[2 + popCount % capacity];
        }

弹出

        // Removes and returns the oldest value in the queue.
        // The slot is chosen from stack[1] (the pop counter) modulo the
        // 64-slot capacity, offset by the two header words. The slot is
        // cleared to 0 and the pop counter is advanced by one.
        // There is no underflow check: popping more than was pushed returns
        // whatever the slot happens to contain.
        uint pop(__private uint * stack)
        {
            const uint capacity = 64;
            const uint popCount = stack[1];
            const uint slot = 2 + popCount % capacity;

            // Take the oldest value, then zero the slot it occupied.
            const uint oldest = stack[slot];
            stack[slot] = 0;

            // Advance the pop counter so the next call reads the following slot.
            stack[1] = popCount + 1;

            return oldest;
        }

用于基准测试的示例内核：

        // Benchmark/demo kernel: every work-item builds its own private queue,
        // runs a fixed sequence of push/pop calls on it, and finally copies the
        // raw queue storage (headers + ring data) to global memory.
        //
        // heap: output buffer; work-item `id` writes heap[id*256 .. id*256+255],
        //       so it must provide 256 uints per work-item.
        __kernel void queue0(__global uint * heap)
        {
            int id=get_global_id(0);

            // Private scratch array for the queue. It is sized to 256 words
            // because both the clearing loop and the copy-out loop below touch
            // indices 0..255; the previous size of 100 made those loops read and
            // write out of bounds. push/pop themselves only use the first
            // 64 + 2 words.
            __private uint q[256];
            for(int i=0;i<256;i++)
                q[i]=0;

            // 55 pushes, then 40 pops.
            for(int i=0;i<55;i++)    
                push(q,i);

            for(int i=0;i<40;i++)    
                pop(q);

            // 20 more pushes, then 35 pops: the queue is empty again afterwards.
            for(int i=0;i<20;i++)    
                push(q,i);

            for(int i=0;i<35;i++)    
                pop(q);

            // 35 interleaved push/pop pairs.
            for(int i=0;i<35;i++)    
            {
                push(q,i);
                pop(q);
            }

            // Leave "hello world" in the queue so it is visible in the dump.
            push(q,'h');
            push(q,'e');
            push(q,'l');
            push(q,'l');
            push(q,'o');
            push(q,' ');
            push(q,'w');
            push(q,'o');
            push(q,'r');
            push(q,'l');
            push(q,'d');

            // Copy this work-item's whole scratch array to its 256-word slice.
            for(int i=0;i<256;i++)
                heap[id*256+i]=q[i];
        }

缓冲区的输出（显示线程id = 0计算结果）

121 110 0      0 0 0    0 0 0    0 0 0 // 121 pushes total, 110 pops total

0 0 0    0 0 0    0 0 0    0 0 0

0 0 0    0 0 0    0 0 0    0 0 0

0 0 0    0 0 0    0 0 0    0 0 0

104 101 108      108 111 32     119 111 114    108 100 0 
// hello world

在单通道 1600 MHz DDR3 内存、配备 12 个计算单元（共 96 个核心，600 MHz）的 Intel HD Graphics 400 上，超过 20 万次推入 + 弹出耗时 6.35 毫秒（运行上述内核，共 1024 个线程，每个线程分配 256 个元素，但循环缓冲区只使用其中 64 + 2 个元素）。

如果用 64 个 4 元素的循环缓冲区来构造这个 64 元素的循环缓冲区，那么还可以在堆栈的顶部和底部之间插入元素！

OpenCL 中是否有 std::queue 的等价物

1 个答案: