Question

我想借助 ARM NEON 库实现一篇论文中的方法，该论文在 ARM Cortex-A8 上只用大约 5 ms 就完成了 ORB 特征计算。但我在 FAST 特征检测这一步就已经遇到了困难。我要实现的论文可以在 here 找到。首先，我对“亮”和“暗”这两个约束不太确定。按我的理解，FAST 需要检查中心像素周围是否有 9 个更暗或 9 个更亮的像素，所以我两种情况都检查了。但现在的问题是：我的实现即使还没有加上最后用来判断是否为角点的移位操作，平均耗时也已经是 OpenCV 完整检测流程的 3 倍。下面是我目前的代码，也许有人能指出我可以做哪些优化。

        //detect with opncv
        Clock::time_point t0 = Clock::now();
        detectors[y]->detect(img, ocv_kps);
        Clock::time_point t1 = Clock::now();

        vector<Point2f> my_kps;
        //threshhold for FAST
        const uchar th = 8;

        int b_cnt = 0;
        int d_cnt = 0;
        //array with four possible corners to be processed in parallel
        uint32_t id_arr[4];
        uint32_t ib_arr[4];

        Clock::time_point t01 = Clock::now();
        // Scan every pixel that has a complete radius-3 ring inside the image and
        // buffer FAST candidates for the bright and the dark constraint.
        //
        // For each candidate a 24-bit mask is stored: bits 0..15 hold the result
        // of the threshold comparison for the 16 ring pixels (clockwise, starting
        // at the top pixel) and bits 16..23 repeat bits 0..7 so that a run which
        // wraps around the ring is still contiguous in the mask.
        //
        // NOTE: candidates are handed over in batches of four. b_cnt / d_cnt carry
        // over between rows, so up to three candidates of each kind can still be
        // pending when the loops finish; they need a final flush once the corner
        // test itself is implemented.
        for (int i = 3; i < img.rows - 3; i++) {
            // Pointers to the seven image rows covering the ring: three above the
            // centre row, the centre row itself and three below it.
            const uchar* Mt3 = img.ptr<uchar>(i - 3);
            const uchar* Mt2 = img.ptr<uchar>(i - 2);
            const uchar* Mt1 = img.ptr<uchar>(i - 1);
            const uchar* Mc = img.ptr<uchar>(i);
            const uchar* Mb1 = img.ptr<uchar>(i + 1);
            const uchar* Mb2 = img.ptr<uchar>(i + 2);
            const uchar* Mb3 = img.ptr<uchar>(i + 3);
            for (int j = 3; j < img.cols - 3; j++) {
                // Column offsets are kept as int: an 8-bit type would wrap for
                // any image wider than 255 pixels and address the wrong columns.
                const int j3 = j + 3;
                const int j2 = j + 2;
                const int j1 = j + 1;
                const int jn3 = j - 3;
                const int jn2 = j - 2;
                const int jn1 = j - 1;

                // Intensities of the centre pixel and of the left, right, top and
                // bottom ring pixels.
                const uchar c = Mc[j];
                const uchar l = Mc[jn3];
                const uchar r = Mc[j3];
                const uchar t = Mt3[j];
                const uchar b = Mb3[j];

                // Thresholds are computed in int so that c + th above 255 and
                // c - th below 0 do not wrap around; with a wrapped threshold
                // almost every pixel would pass the comparison.
                const int thb = c + th;
                const int thd = c - th;

                // Bright constraint on the four compass pixels.
                const bool cbt = t > thb;
                const bool cbb = b > thb;
                const bool cbl = l > thb;
                const bool cbr = r > thb;

                // Dark constraint on the four compass pixels.
                const bool cdl = l < thd;
                const bool cdr = r < thd;
                const bool cdt = t < thd;
                const bool cdb = b < thd;

                // Pre-test: at least two adjacent compass pixels must fulfil the
                // respective constraint.
                const bool bright_candidate = (cbl && cbt) || (cbt && cbr)
                        || (cbr && cbb) || (cbb && cbl);
                const bool dark_candidate = (cdl && cdt) || (cdt && cdr)
                        || (cdr && cdb) || (cdb && cdl);
                if (!bright_candidate && !dark_candidate) {
                    continue;
                }

                // Remaining ring pixels, loaded once for both constraints.
                const uchar mt3 = Mt3[j1];
                const uchar mt3n = Mt3[jn1];
                const uchar mt2 = Mt2[j2];
                const uchar mt2n = Mt2[jn2];
                const uchar mt1 = Mt1[j3];
                const uchar mt1n = Mt1[jn3];
                const uchar mb3 = Mb3[j1];
                const uchar mb3n = Mb3[jn1];
                const uchar mb2 = Mb2[j2];
                const uchar mb2n = Mb2[jn2];
                const uchar mb1 = Mb1[j3];
                const uchar mb1n = Mb1[jn3];

                if (bright_candidate) {
                    // One bit per ring pixel, clockwise from the top pixel.
                    const uint32_t ring = cbt | (mt3 > thb) << 1
                            | (mt2 > thb) << 2 | (mt1 > thb) << 3
                            | cbr << 4 | (mb1 > thb) << 5
                            | (mb2 > thb) << 6 | (mb3 > thb) << 7
                            | cbb << 8 | (mb3n > thb) << 9
                            | (mb2n > thb) << 10 | (mb1n > thb) << 11
                            | cbl << 12 | (mt1n > thb) << 13
                            | (mt2n > thb) << 14 | (mt3n > thb) << 15;
                    // Repeat bits 0..7 in bits 16..23 for wrap-around runs.
                    ib_arr[b_cnt] = ring | ((ring & 0xFFu) << 16);
                    b_cnt++;
                    // Once four candidates are buffered, load them for the
                    // parallel corner test.
                    if (b_cnt == 4) {
                        // vld1q_u32 loads exactly the four buffered masks;
                        // vld4_u32 would read eight uint32_t values and run
                        // past the end of the four-element array.
                        uint32x4_t IB = vld1q_u32(ib_arr);
                        /*
                         * here the actual shift operation would take place
                         */
                        b_cnt = 0;
                    }
                }

                if (dark_candidate) {
                    // One bit per ring pixel, clockwise from the top pixel.
                    const uint32_t ring = cdt | (mt3 < thd) << 1
                            | (mt2 < thd) << 2 | (mt1 < thd) << 3
                            | cdr << 4 | (mb1 < thd) << 5
                            | (mb2 < thd) << 6 | (mb3 < thd) << 7
                            | cdb << 8 | (mb3n < thd) << 9
                            | (mb2n < thd) << 10 | (mb1n < thd) << 11
                            | cdl << 12 | (mt1n < thd) << 13
                            | (mt2n < thd) << 14 | (mt3n < thd) << 15;
                    // Repeat bits 0..7 in bits 16..23 for wrap-around runs.
                    id_arr[d_cnt] = ring | ((ring & 0xFFu) << 16);
                    d_cnt++;
                    // Once four candidates are buffered, load them for the
                    // parallel corner test.
                    if (d_cnt == 4) {
                        // See the bright branch: four masks, so vld1q_u32.
                        uint32x4_t IA = vld1q_u32(id_arr);
                        /*
                         * here the actual shift operation would take place
                         */
                        d_cnt = 0;
                    }
                }
            }
        }
        Clock::time_point t11 = Clock::now();
        cout << "my algorithm found " << my_kps.size()
                << " and ocv found " << ocv_kps.size() <<  endl;

        microseconds ms1 = std::chrono::duration_cast < microseconds
                > (t1 - t0);
        microseconds ms2 = std::chrono::duration_cast < microseconds
                > (t11 - t01);

        rs.Push((double) ms2.count());
        cout << "my algorithm duration " << ms2.count()
                << " and ocv duration is " << ms1.count()  << endl;

Answer 1

我有一个ORB提取器，它在树莓派上运行速度为30fps。

https://github.com/0xfaded/pislam

优化确实是一门玄学（black art），更糟糕的是，ARM 从未发布过针对 Cortex-A53 的优化指南。我们手头最接近的是 Cortex-A57 的指南，它的 NEON 单元可能与之类似。

我无法在这里提供完整的答案，但我会分享一些关于我的流程的内容。

我的FAST提取器的第一部分加载测试像素环并将它们转换为16位向量，就像你的代码一样。我没有直接编写asm，而是使用了gcc内在函数。尽管如此，我确保gcc：

没有把任何寄存器溢出（spill）到栈上
为每次比较发出最少数量的指令

你会注意到，第一次比较没有用掩码隔离出它对应的位，而这个掩码本应是 0x80。这样就省下了一个原本要用来保存该常量的寄存器，给 gcc 留出了足够的回旋余地，使它不必把寄存器溢出到栈上。

您还会注意到一些看起来相当吓人的 intrinsic 用法：

  // Bit-select under the constant mask 0x40: that one bit of d0 / l0 is taken
  // from the comparison result (test >= dark, test <= light); every other bit
  // keeps the value already held in d0 / l0.
  d0 = vbslq_u8(vdupq_n_u8(0x40u), vcgeq_u8(test, dark), d0);
  l0 = vbslq_u8(vdupq_n_u8(0x40u), vcleq_u8(test, light), l0);

这相当于

  // Plain-operator form of the two bit-select intrinsics: OR bit 0x40 of each
  // comparison mask into its accumulator. The light comparison is `<=`, to
  // match vcleq_u8; the parentheses make the intended precedence explicit.
  d0 |= (test >= dark) & 0x40;
  l0 |= (test <= light) & 0x40;

Gcc 可以顺利编译后者，但生成的指令数是前者的 1.5 倍。

第二部分是对这些 16 位向量进行 FAST-9 测试。下面这段代码对应 16 条指令，但我花了将近一个月的时间才构思出来。

  // FAST-9 test on the packed ring masks, 16 byte lanes at a time.
  // Per-lane bit test of d0 against d1: a lane becomes all ones when the two
  // operands share at least one set bit, all zeros otherwise.
  // NOTE(review): t0 and t1 are computed from identical operands - confirm
  // against Fast.h that this is intended.
  uint8x16_t t0 = vtstq_u8(d0, d1);
  uint8x16_t t1 = vtstq_u8(d0, d1);

  // Bit-select: where the mask bits are set take l0 / l1, elsewhere d0 / d1.
  t0 = vbslq_u8(t0, l0, d0);
  t1 = vbslq_u8(t1, l1, d1);

  // Count the leading zero bits of each t0 lane, shift the matching t1 lane
  // left by that count minus one, then compare the shifted lane with zero.
  uint8x16_t cntLo = vclzq_u8(t0);
  uint8x16_t testLo = t1 << (cntLo - 1);
  // Compare-with-zero written as inline asm because gcc does not emit this
  // immediate form for `testLo == 0`; the operand is updated in place.
  asm("vceq.u8  %q0, %q0, #0" : [val] "+w" (testLo));

  // Same sequence with the roles of t0 and t1 swapped.
  uint8x16_t cntHi = vclzq_u8(t1);
  uint8x16_t testHi = t0 << (cntHi - 1);
  asm("vceq.u8  %q0, %q0, #0" : [val] "+w" (testHi));

  // Combine both halves, then widen every non-zero lane to all ones.
  uint8x16_t result = (cntLo & testLo) | (cntHi & testHi);
  result = vtstq_u8(result, result);

令人讨厌的是，gcc不会将testLo == 0编译为vceq.u8 %q0, %q0, #0，这是一个用于与常数零进行比较的特殊指令。我手动插入这些，这削减了另外几个指令。

希望能提供一些见解。 Fast.h

Answer 2

在深入研究了一番 ARM 汇编之后，我写出了一份代码，它在 ARM 上的运行速度至少是 OpenCV 内置 FAST-9 实现的 2 倍。您可以在 GitHub 上查看代码。非常欢迎任何关于进一步优化的建议。在我的 Raspberry Pi 3 上的耗时为：我的算法 1000ms，OpenCV 2000ms

在320x240灰度图像上。

针对ARM的优化FAST计算

2 个答案: