Armv8汇编程序优化

时间:2016-11-03 06:15:13

标签: c++ arm computer-vision inline-assembly armasm

你好,我在 Raspberry Pi 3 上完成了我的第一个汇编实现,想请教如何改进这个算法。它要做的事情是:在 320x240 的 uint8_t 数组中逐点分析,并为每个点生成两个位掩码。位掩码是通过把中心点与半径为 3 的(取整后的)圆周上的 16 个像素逐一比较得到的。如果圆周上的像素小于中心值减去阈值,则 regLO 掩码的对应位为 1,否则为 0;如果圆周上的像素大于中心值加上阈值,则 regHI 掩码的对应位为 1,否则为 0。每次比较之后,regHI 和 regLO 都左移 1 位。这样最终得到两个位掩码,分别标记圆周上比中心更暗和更亮的像素。该算法是 FAST-9 算法的预处理步骤。

[编辑]: 我知道 C/C++ 代码的速度与我的汇编代码相近(实际上 C++ 版本耗时 19ms,汇编版本耗时 17ms),但我正在学习汇编。我也知道 SIMD 会更快,但我想先学会基础的汇编。

[EDIT2]: 添加了c ++和SIMD实现

#include <iostream>
#include <stdint.h>
#include <chrono>
#include <ctime>
using namespace std;
#define HT 240
#define WT 320
#define WTHT 76800
#define WT3 960
typedef std::chrono::high_resolution_clock clock2;
typedef std::chrono::microseconds res;

// Assembly for testing ONE circle pixel, located OFFSET bytes after the
// window origin held in %[in].
//   r0 = center - threshold (lower bound), r2 = center + threshold (upper bound)
//   r1 = scratch for the loaded pixel
// Signed conditions (lt/gt) are used on purpose: r0 becomes negative when the
// center value is below the threshold, and an unsigned "lo" comparison would
// then classify every pixel as darker.
// A pixel cannot be below r0 and above r2 at the same time, so no branch (and
// therefore no label) is needed between the two tests.
#define FAST9_TEST(OFFSET)                       \
    "ldrb  r1, [%[in], #" #OFFSET "]\n\t"        \
    "cmp   r1, r0\n\t"                           \
    "it    lt\n\t"                               \
    "addlt %[out], %[out], #1\n\t"               \
    "cmp   r1, r2\n\t"                           \
    "it    gt\n\t"                               \
    "addgt %[outhi], %[outhi], #1\n\t"

// Makes room for the next bit in both masks, then tests the next circle pixel.
#define FAST9_NEXT(OFFSET)                       \
    "lsl   %[out], %[out], #1\n\t"               \
    "lsl   %[outhi], %[outhi], #1\n\t"           \
    FAST9_TEST(OFFSET)

// Benchmark: for every pixel that is at least 3 pixels away from the image
// border, build two 16-bit masks over the radius-3 circle around it
// (resultlo: circle pixel < center-8, resulthi: circle pixel > center+8; the
// first circle pixel ends up in bit 15, the last one in bit 0) and print the
// elapsed time in microseconds.
int main() {
    clock2::time_point t1, t2;

    // Test image: constant 9 with three distinct pixels. arr[3] and arr[4]
    // are the first two circle pixels of the window whose center is
    // arr[3 + 3*WT].
    volatile uint8_t arr[WTHT];
    for (int i = 0; i < WT * HT; i++) {
        arr[i] = 9;
    }
    arr[3] = 7;
    arr[4] = 10;
    arr[3 + 3 * WT] = 17;

    t1 = clock2::now();
    for (int iy = WT3; iy < WTHT - WT3; iy += WT) {
        // All offsets in the asm are relative to the TOP-LEFT corner of the
        // 7x7 window (the center sits at +963 = 3*WT + 3). For the center
        // (row iy/WT, column ix) the window therefore starts three rows up
        // and three columns left; starting at arr[iy] instead would read
        // past the end of the array on the last rows.
        volatile uint8_t *pnt = &arr[iy - WT3];
        for (int ix = 3; ix < WT - 3; ix++, pnt++) {
            uint32_t resultlo = 0;
            uint32_t resulthi = 0;
            asm volatile(
                // r0 = center - 8 (lower bound), r2 = center + 8 (upper bound)
                "ldrb  r0, [%[in], #963]\n\t"
                "sub   r0, r0, #8\n\t"
                "add   r2, r0, #16\n\t"
                // Walk the 16 circle pixels clockwise, starting at the top.
                FAST9_TEST(3)
                FAST9_NEXT(4)
                FAST9_NEXT(325)
                FAST9_NEXT(646)
                FAST9_NEXT(966)
                FAST9_NEXT(1286)
                FAST9_NEXT(1605)
                FAST9_NEXT(1924)
                FAST9_NEXT(1923)
                FAST9_NEXT(1922)
                FAST9_NEXT(1601)
                FAST9_NEXT(1280)
                FAST9_NEXT(960)
                FAST9_NEXT(640)
                FAST9_NEXT(321)
                FAST9_NEXT(2)
                // "+r": both masks are read AND written by the asm, so the
                // zero initialisation above must reach the registers.
                : [out] "+r"(resultlo), [outhi] "+r"(resulthi)
                : [in] "r"(pnt)
                // cmp changes the condition flags ("cc"); the asm reads the
                // image through %[in] ("memory").
                : "r0", "r1", "r2", "cc", "memory");
        }
    }
    t2 = clock2::now();
    std::cout << "Elapsed time is "
              << std::chrono::duration_cast<res>(t2 - t1).count()
              << "   microseconds.\n";

    return 0;
}

#undef FAST9_NEXT
#undef FAST9_TEST

以及 [C++] 版本:

uint64_t r1=0;
    uint64_t r2=0;
    uint32_t result2=0;
    uint32_t result3=0;
    {
        // Offsets of the 16 pixels on the radius-3 circle, relative to the
        // top-left corner of the 7x7 window, walked clockwise from the top.
        // Written as row*WT + column so they follow the image width.
        static const int ring_offsets[16] = {
            0 * WT + 3, 0 * WT + 4, 1 * WT + 5, 2 * WT + 6,
            3 * WT + 6, 4 * WT + 6, 5 * WT + 5, 6 * WT + 4,
            6 * WT + 3, 6 * WT + 2, 5 * WT + 1, 4 * WT + 0,
            3 * WT + 0, 2 * WT + 0, 1 * WT + 1, 0 * WT + 2};
        // The window center, relative to the same origin.
        const int center_offset = 3 * WT + 3;
        const int threshold = 8;

        for (int iy = WT3; iy < (WTHT) - WT3; iy += WT) {
            // The window origin lies three rows above the center row;
            // starting at arr[iy] would read beyond the end of the image on
            // the last rows.
            pnt = &arr[iy - WT3];
            for (int ix = 3; ix < WT - 3; ++ix) {
                // Bounds are kept as int so that center-8 / center+8 cannot
                // wrap around the uint8_t range.
                const int center = *(pnt + center_offset);
                const int lower = center - threshold;
                const int upper = center + threshold;

                // result2: bit set where the circle pixel is brighter than upper.
                // result3: bit set where the circle pixel is darker than lower.
                // Shifting BEFORE inserting the new bit yields a 16-bit mask
                // (first circle pixel in bit 15, last in bit 0), the same
                // layout the inline-assembly version produces.
                result2 = 0;
                result3 = 0;
                for (int k = 0; k < 16; ++k) {
                    const int p = *(pnt + ring_offsets[k]);
                    result2 = (result2 << 1) | (p > upper ? 1u : 0u);
                    result3 = (result3 << 1) | (p < lower ? 1u : 0u);
                }

                // Advance the window to the next pixel.
                ++pnt;
                // Accumulate so the computation is not optimized away.
                r1 += result2;
                r2 += result3;
            }
        }
    }

[SIMD 或 NEON intrinsics]

for (int iy = WT3; iy < ((WTHT) - WT3); iy += WT) {
        // Window origin: three rows above the center row, so the center of
        // the first window in this row sits at pnt + 963.
        pnt = &arr[iy - WT3];
        for (int ix = 3; ix < (WT - 3); ++ix) {
            // Broadcast the center value and the threshold to all 16 lanes.
            const uint8x16_t center = vdupq_n_u8(*(pnt + 963));
            const uint8x16_t threshold = vdupq_n_u8(8);
            // Saturating arithmetic keeps the bounds inside 0..255; a plain
            // uint8_t c-8 / c+8 wraps around for very dark or very bright
            // centers and would flag the wrong pixels.
            const uint8x16_t ll1 = vqsubq_u8(center, threshold);  // lower bound
            const uint8x16_t hl1 = vqaddq_u8(center, threshold);  // upper bound

            // Gather the 16 circle pixels (clockwise from the top) into one
            // contiguous array so they can be loaded into a NEON register.
            uint8_t ps1[16] = {*(pnt + 3),    *(pnt + 4),    *(pnt + 325),  *(pnt + 646),
                               *(pnt + 966),  *(pnt + 1286), *(pnt + 1605), *(pnt + 1924),
                               *(pnt + 1923), *(pnt + 1922), *(pnt + 1601), *(pnt + 1280),
                               *(pnt + 960),  *(pnt + 640),  *(pnt + 321),  *(pnt + 2)};
            const uint8x16_t ring = vld1q_u8(ps1);

            // Lane is all-ones where the circle pixel is below the lower bound.
            uint8x16_t rl = vcltq_u8(ring, ll1);
            // Lane is all-ones where the circle pixel is above the upper bound.
            uint8x16_t rh = vcgtq_u8(ring, hl1);
            // NOTE(review): rl and rh are not consumed in this fragment, so
            // the compiler may drop the work above - confirm the timing
            // measures what is intended.
            ++pnt;
        }
    }

如果您能指出我可以对该代码进行一些优化以使其更快地运行,那将是非常好的

执行时间:汇编版本 17ms;使用 -O2 标志的 C/C++ 版本 19ms;SIMD 版本 44ms。

0 个答案:

没有答案