tinyAVR:最着名的8位和16位因子乘法程序?

时间:2015-04-29 05:28:01

标签: algorithm avr multiplication 8-bit attiny

“比avr200b.asm更快”?avr200b.asm中的mpy8u例程(面向Atmel AVR系列中未实现任何MUL指令的处理器)看起来相当通用,但mpy16u显得很粗糙:它把结果的两个低字节都循环移位16次,而不是8次。Antonio提出了一个快速的16×16→16位无符号乘法(fast 16×16→16 unsigned multiplication),最坏情况下需要64个周期,不包括调用/返回开销。
我随意地建议按优先级从高到低依次优化以下目标:最坏情况周期数、字数(RAM和闪存)、寄存器使用量、预期周期数。(还存在精简内核的AVR(“个位数”型号的ATtiny,即10/20/40),它们在时序等方面存在差异,我建议忽略这些差异。)

(注意:不要将此处的任何主张视为理所当然,至少在没有独立肯定的情况下。)

对于没有 MUL 指令的AVR,目前已知的最佳8×8→8/16、16×16→16/32和16×8→16/24位乘法程序是什么?

9 个答案:

答案 0 :(得分:0)

这是一次基于平方数查找表的尝试,首先是各例程共用的准备部分(寄存器定义):

;.def   ZL  = r30
;.def   ZH  = r31

; Register aliases used by the table-lookup multiply routines below.
; register assignment trying to follow "avr200b.asm";
;  gcc would use 25 down in stead of 16 up
; (and same registers for parameters and result, requiring
;  two or one movw for another four or two bytes & cycles)
.def    a0  = r16   ; 16-bit factor a, low byte
.def    a1  = r17   ; 16-bit factor a, high byte
.def    a   = r16   ; 8-bit factor (same register as a0)
.def    b   = r17   ; 8-bit factor (same register as a1) ; or r18, rather?
.def    b0  = r18   ; 16-bit factor b, low byte
.def    b1  = r19   ; 16-bit factor b, high byte
.def    p0  = r20   ; product low byte
.def    p1  = r21
.def    p2  = r22
.def    p3  = r23   ; product most significant byte (32-bit results)
.def    sq  = r25   ; tmp, might have used r0
;                          & parameterless LPM
.def    s0  = r0    ; temporary for a looked-up table byte
.def    s1  = r24   ; temporary for a looked-up table byte

; "squares table" shall be a 1 KByte table of squares of
;  9-bit natural numbers, divided by 4;
;  aligned on a 1K border in program memory,
;  organised as 512 lower bytes followed by the high bytes.

; the idea is to exploit
;  p = a * b = Squares[a+b] - Squares[a-b]

; assembly lines are marked up with cycle count and
; (latest) start cycle in block.
;  If first line in code block, the (latest)
;  block start cycle follows;
;  else if last line, the (max) block cycle total

8×8→16位:

;**********************************************************
;*
;* "mpy8T" - 8x8->16 Bit Unsigned Multiplication
;*                                using table lookup
;* (mpy8u: 34 words/cycles (avr200b.asm))
;* Multiplies two 8-bit register values a and b.
;* The result is placed in p1:p0.
;*
;* Number of words  : 17 + 512(table) = 529 + return
;* Number of cycles : 25 + return (table coming preset ...)
;* Low  registers used  : None
;* High registers used  : 5+2 (a, b, p1:p0, sq;
;*                             + Z(r31:r30))
;*
;* Table layout used by the code: Squares[n] = n*n/4 for the
;* 9-bit index n; low bytes at flash byte address 0x0400+n
;* (ZH = 4 or 5), high bytes 0x200 further up (ZH = 6 or 7).
;*
;*********************************************************
mpy8T:
; p = a * b = Squares[a+b] - Squares[a-b]
    ldi     ZH, 2       ; 1 0   0   squares table / 2
    mov     ZL, a       ; 1 1
    add     ZL, b       ; 1 2       a+b
    rol     ZH          ; 1 3       9 bit offset: ZH = 4 + carry
    lpm     p0, Z       ; 3 4       a+bl            1
; sbr takes a bit MASK (it is ori), not a bit number:
; mask 2 sets bit 1 of ZH, i.e. adds 0x200 -> high-byte half.
    sbr     ZH, 2       ; 1 7
    lpm     p1, Z       ; 3 8   11  a+bh            2*

    ldi     ZH, 4       ; 1 0   11  squares table

    mov     ZL, a       ; 1 0   12
    sub     ZL, b       ; 1 1       a-b
    brcc    pos         ; 1 2
    neg     ZL          ; 1 3       |a-b|: the square is the same
pos:
    lpm     sq, Z       ; 3 4       a-bl            3
    sub     p0, sq      ; 1 7
    sbr     ZH, 2       ; 1 8       (ldi ZH, 6); leaves carry intact
    lpm     sq, Z       ; 3 9       a-bh            4*
    sbc     p1, sq      ; 1 12  13

    ret                 ;       25 cycles up to here

16×16→16/32位:

;**********************************************************
;*
;* "mpy16T" - 16x16->32 Bit Unsigned Multiplication
;*                                   using table lookup
;*
;* Multiplies two 16-bit register values a1:a0 and b1:b0.
;* The result is placed in p3:p2:p1:p0.
;*
;* Number of words  :  74 + 512(table) = 586
;*                        + return (+ push/pop)
;* Number of cycles : 106 + return (+ push/pop)
;*                         (table coming preset ...)
;* (avr200b.asm mpy16u improved: 100, as-is: 116)
;* Low  registers used  : 2 (s0, zero) (could use r26&r27)
;* High registers used  : 10+2 (a1:a0, b1:b0, p3:p2:p1:p0,
;*                              sq, s1; + Z(r31:r30))
;*
;* Each byte product x*y is formed as
;*   Squares[x+y] - Squares[|x-y|]
;* with the low bytes of the table at flash byte address
;* 0x0400+n (ZH = 4/5) and the high bytes at 0x0600+n
;* (ZH = 6/7).  "sbr ZH, 2" switches from the low-byte half
;* to the high-byte half: sbr takes a bit MASK, not a bit
;* number, and it does not touch the carry flag.
;*
;*********************************************************
mpy16T:
.def    zero = r2 ; (gcc's choice of r1 is funny given mul)
;   push    zero
    clr     zero        ; 1 0   0
; initialise p1:p0 with Squares[a0+b0]
    ldi     ZH, 2       ; 1 0   1   squares table / 2
    mov     ZL, a0      ; 1 1
    add     ZL, b0      ; 1 2       a0+b0
    rol     ZH          ; 1 3       9 bit offset
    lpm     p0, Z       ; 3 4       a0+b0l          1
    sbr     ZH, 2       ; 1 7       squares table 2nd half
    lpm     p1, Z       ; 3 8   11  a0+b0h          2
; initialise p3:p2 with Squares[a1+b1]
    ldi     ZH, 2       ; 1 0   12
    mov     ZL, a1      ; 1 1
    add     ZL, b1      ; 1 2       a1+b1
    rol     ZH          ; 1 3
    lpm     p2, Z       ; 3 4       a1+b1l          3
    sbr     ZH, 2       ; 1 7       squares table 2nd half
    lpm     p3, Z       ; 3 8   11  a1+b1h          4

; all differences are 8 bit abs: save index carry handling
    ldi     ZH, 6       ; 1 0   23  squares table 2nd half
; do highest square first for carry handling
    mov     ZL, a1      ; 1 0   24
    sub     ZL, b1      ; 1 1       a1-b1
    brcc    pos11       ; 1 2
    neg     ZL          ; 1 3
pos11:
    lpm     s1, Z       ; 3 4       a1-b1h          5
    ldi     ZH, 4       ; 1 7       squares table 1st half
    lpm     s0, Z       ; 3 8   11  a1-b1l          6

    mov     ZL, a0      ; 1 0   35
    sub     ZL, b0      ; 1 1       a0-b0
    brcc    pos00       ; 1 2
    neg     ZL          ; 1 3
pos00:
; one borrow chain through all four product bytes:
; p1:p0 -= Squares[|a0-b0|], p3:p2 -= Squares[|a1-b1|]
    lpm     sq, Z       ; 3 4       a0-b0l          7
    sub     p0, sq      ; 1 7
    ldi     ZH, 6       ; 1 8       squares table 2nd half
    lpm     sq, Z       ; 3 9       a0-b0h          8
    sbc     p1, sq      ; 1 12
    sbc     p2, s0      ; 1 13
    sbc     p3, s1      ; 1 14  15

; cross products are accumulated one byte up (p3:p2:p1)
    mov     ZL, a1      ; 1 0   50
    sub     ZL, b0      ; 1 1       a1-b0
    brcc    pos10       ; 1 2
    neg     ZL          ; 1 3
pos10:
    lpm     s1, Z       ; 3 4       a1-b0h          9
    ldi     ZH, 4       ; 1 7       squares table 1st half
    lpm     sq, Z       ; 3 8       a1-b0l          10
    sub     p1, sq      ; 1 11
    sbc     p2, s1      ; 1 12
    sbc     p3, zero    ; 1 13  14

    mov     ZL, a0      ; 1 0   64
    sub     ZL, b1      ; 1 1       a0-b1
    brcc    pos01       ; 1 2
    neg     ZL          ; 1 3
pos01:
    lpm     sq, Z       ; 3 4       a0-b1l          11
    sub     p1, sq      ; 1 7
    ldi     ZH, 6       ; 1 8       squares table 2nd half
    lpm     sq, Z       ; 3 9       a0-b1h          12
    sbc     p2, sq      ; 1 12
    sbc     p3, zero    ; 1 13  14

    ldi     ZH, 2       ; 1 0   78
    mov     ZL, a1      ; 1 1
    add     ZL, b0      ; 1 2       a1+b0
    rol     ZH          ; 1 3
    lpm     sq, Z       ; 3 4       a1+b0l          13
    add     p1, sq      ; 1 7
    sbr     ZH, 2       ; 1 8       squares table 2nd half
    lpm     sq, Z       ; 3 9       a1+b0h          14
    adc     p2, sq      ; 1 12
    adc     p3, zero    ; 1 13  14

    ldi     ZH, 2       ; 1 0   92
    mov     ZL, a0      ; 1 1
    add     ZL, b1      ; 1 2       a0+b1
    rol     ZH          ; 1 3
    lpm     sq, Z       ; 3 4       a0+b1l          15
    add     p1, sq      ; 1 7
    sbr     ZH, 2       ; 1 8       squares table 2nd half
    lpm     sq, Z       ; 3 9       a0+b1h          16
    adc     p2, sq      ; 1 12
    adc     p3, zero    ; 1 13  14

;   pop     zero
    ret                 ;       106

16×16→16位:

;*********************************************************
;*
;* "mpy16T16" - 16x16->16 Bit Unsigned Multiplication
;*                                     using table lookup
;*
;* Multiplies  two 16-bit register values a1:a0 and b1:b0.
;* The result (low 16 bits of the product) is placed in p1:p0.
;*
;* Number of words  :  41 + 512(table)=553 + return
;* Number of cycles :<=57 + return
;* Low  registers used  : None
;* High registers used  : 7+2 (a1:a0, b1:b0, p1:p0, sq;
;*                             + Z(r31:r30))
;*
;* a0*b0 needs both table bytes; of the cross products a1*b0
;* and a0*b1 only the low byte reaches p1, so only the
;* low-byte half of the table is read for them.
;* Table: low bytes at flash byte address 0x0400+n (ZH = 4/5),
;* high bytes at 0x0600+n (ZH = 6/7).
;*
;*********************************************************
mpy16T16:
    ldi     ZH, 2       ; 1 0   0   squares table / 2
    mov     ZL, a0      ; 1 1
    add     ZL, b0      ; 1 2       a0+b0
    rol     ZH          ; 1 3       9 bit offset
    lpm     p0, Z       ; 3 4       a0+b0l          1
; sbr takes a bit MASK: 2 sets bit 1 of ZH (+0x200, high bytes)
    sbr     ZH, 2       ; 1 7
    lpm     p1, Z       ; 3 8   11  a0+b0h          2*

    ldi     ZH, 4       ; 1 0   11  squares table

    mov     ZL, a1      ; 1 0   12
    sub     ZL, b0      ; 1 1       a1-b0
    brcc    noNeg10     ; 1 2
    neg     ZL          ; 1 3
noNeg10:
    lpm     sq, Z       ; 3 4       a1-b0l          3
    sub     p1, sq      ; 1 7   8

    mov     ZL, a0      ; 1 0   20
    sub     ZL, b1      ; 1 1       a0-b1
    brcc    noNeg01     ; 1 2
    neg     ZL          ; 1 3
noNeg01:
    lpm     sq, Z       ; 3 4       a0-b1l          4
    sub     p1, sq      ; 1 7   8

    mov     ZL, a0      ; 1 0   28
    sub     ZL, b0      ; 1 1       a0-b0
    brcc    noNeg00     ; 1 2
    neg     ZL          ; 1 3
noNeg00:
    lpm     sq, Z       ; 3 4       a0-b0l          5
    sub     p0, sq      ; 1 7
    sbr     ZH, 2       ; 1 8       (ldi ZH, 6); carry unaffected
    lpm     sq, Z       ; 3 9       a0-b0h          6*
    sbc     p1, sq      ; 1 12  13

    ldi     ZH, 2       ; 1 0   41
    mov     ZL, a1      ; 1 1
    add     ZL, b0      ; 1 2       a1+b0
    rol     ZH          ; 1 3
    lpm     sq, Z       ; 3 4       a1+b0l          7
    add     p1, sq      ; 1 7   8

    ldi     ZH, 2       ; 1 0   49
    mov     ZL, a0      ; 1 1
    add     ZL, b1      ; 1 2       a0+b1
    rol     ZH          ; 1 3
    lpm     sq, Z       ; 3 4       a0+b1l          8
    add     p1, sq      ; 1 7   8

    ret                 ;       57

答案 1 :(得分:0)

无符号8×8→8,左移位因子和加,展开。

; factors a0, b0 and product p0
; step8 <bit>: add the (already shifted) factor a0 into p0 when
; bit <bit> of b0 is set, then double a0 for the next bit.
.MACRO step8
    sbrc    b0, @0
    add     p0, a0
    add     a0, a0  ; +3
.EndM
; 8x8->8 bit unsigned multiply, factor shift, unrolled.
; 24 cycles & words + return (caveat emptor)
; In : a0, b0.  Out: p0 = low byte of a0 * b0.  a0 is destroyed.
mpy8U8:
    clr     p0      ; 1
    step8   0       ; 4
    step8   1       ; 7
    step8   2       ; 10
    step8   3       ; 13
    step8   4       ; 16
    step8   5       ; 19
    step8   6       ; 22
; last bit: no need to double a0 any more
    sbrc    b0, 7   ; 23/24
    add     p0, a0  ; 24
    ret
; (an unmatched "# endif" used to follow here; no "#if" opens it)

无符号16×16→16,左移位因子和加,展开。
现在显示宏;三种口味:简单和 - 好吧,不是。

; mpy16A16: 16x16->16 bit unsigned multiply, shifting
; one factor bit-by-bit, testing same bits in
; different bytes of the other; idea due to Antonio
; (http://stackoverflow.com/users/2436175/antonio)
; in <http://stackoverflow.com/a/29812254/3789665>
; <= 62/61/60 cycles, 62/87/155 words + return (caveat emptor)
; (57.5, 56.75 and 55.75 expected _for a uniform distribution_)
; "middle" variant assembled with neither Plain nor Need4Speed
; defined, shown without separate "timing comments"
; ("without speed345", just add one to the Need4Speed timings)

; some macros using factors a1:a0, b1:b0 and product p1:p0
; addA: p1:p0 += a1:a0 (16-bit add with carry between the bytes)
.MACRO addA     ;   adds (weighted) factor "a" into product
    add     p0, a0  ; +1
    adc     p1, a1  ; +2
.EndM
; doubleA: a1:a0 *= 2, giving "a" the weight of the next bit
.MACRO doubleA  ;   adds (shifts/weights) factor "a"
    add     a0, a0  ; +1
    adc     a1, a1  ; +2
.EndM
; doHighB <bit>: if that bit of b1 is set, add a0 to the product's
; high byte only (a1 would land beyond the 16-bit result)
.MACRO doHighB  ;   "does" bit in b1, bit number as a parameter
    sbrc    b1, @0  ; 1
    add     p1, a0  ; 2
.EndM
; "do" 2 bits, bit numbers in b1 and b0 as parameters
; @0 = bit number tested in b1, @1 = bit number tested in b0
.MACRO stepS
    bst     b0, @1  ; +1        copy the b0 bit to T
    brtc    noadd   ; +2/3
    addA            ; +4
noadd:  ;   gets decorated; almost as neat as "numeric labels"
    doHighB @0      ; +6
    doubleA         ; +8
.EndM
; step16 <bit>: stepS with the same bit number for b1 and b0
.MACRO step16; "do" 2 bits, bit# in b1 and b0 as a parameter
    stepS   @0, @0
.EndM
; empty if no Need4Speed; speed3do45, really
; Handles bits 3, 4 and 5 of both factor bytes.  It is expanded
; only after "lsl b0" in mpy16A16: original bit 3 of b0 is then
; in the H flag, and original bits 4 and 5 sit in b0 bits 5 and
; 6.  stepS takes two parameters (bit in b1, bit in b0), so the
; b0 bit number passed is one higher than the b1 bit number.
.MACRO speed345
#if Need4Speed
    brhc    noadd   ; 1/2
; kkbb1     (b starts with two Known bit, bit 3 just checked)
    addA            ; 3
noadd:              ;   2/3
    doHighB 3       ; 5
    doubleA         ; 7
    stepS   4, 5    ; 15
    stepS   5, 6    ; 23
#endif
.EndM

# if !Plain
; Tail code of mpy16A16 for the cases where bit 7 of the original
; b0 is clear.  Reached by the brcc that follows "lsl b0" in
; mpy16A16; the N flag still holds original bit 6 at that point.
; showing up here due to limited branch offset
no67:               ;       29
; 00    (original b0 bits 7 and 6 both clear)
    speed345        ; 23
    doHighB 6       ; 2     54
    sbrs    b1, 7   ; 1/2       doHighB 7 with early out
    ret             ;       55  last to start, first to finish
    add     a0, a0  ; 3         only a0 still matters: weight for bit 7
    add     p1, a0  ; 4     58
    ret

no7:                ;       27
; 0
    brpl    no67    ; 1/2   29
; 01    (bit 7 clear, bit 6 set)
    speed345        ; 23    51
    addA            ; 2
    doHighB 6       ; 4     55
    sbrs    b1, 7   ; 1/2       doHighB 7 with early out
    ret             ;       56
    add     a0, a0  ; 3
    add     p1, a0  ; 4     59
    ret
# endif

在单独的代码块中尝试轻松浏览:

; mpy16A16: 16x16->16 bit unsigned multiply, p1:p0 = a1:a0 * b1:b0.
; Shift-and-add: a1:a0 is doubled bit by bit while the same bit
; number is tested in b0 (add a1:a0 to p1:p0) and in b1 (add a0 to
; p1).  a1:a0 is destroyed; b0 is shifted left unless Plain is set.
; Build-time switches: Plain, Need4Speed (see the #if blocks).
mpy16A16:
    clr     p0      ; 1
    clr     p1      ; 2 ; p1:p0 = 0

    sbrc    b0, 0   ; 3
; "fast-laning the trailing zeroes case" isn't as attractive as
; in a shift pp variant: no gain from avoiding "shift pp", here
    movw    p0, a0  ; 4 ; p1:p0 "+=" a1:a0
    doHighB 0       ; 6
    add     a0, a0  ; 7 ; breq a0zero for early out added 1
                            ; (+you'd have to handle the carry)
    adc     a1, a1  ; 8 ; breq a1zero for early out added 1
                    ;       8
    step16  1       ; +8
    step16  2       ; +8    24
# if !Need4Speed
    step16  3       ; +8
    step16  4       ; +8
    step16  5       ; +8    48
#  if Plain
    step16  6       ; +8
                    ;       56
    doHighB 7       ; +2
    sbrs    b0, 7   ; +3/4
    ret             ;       59  top for 1bbbbbbb01bbbbbb ;-)
    addA            ; +6    62 _worst case_!
    ret
#  endif
# endif
# if !Plain
; from here on b0 is shifted: C = original bit 7, N = original bit 6
    lsl     b0      ; 1     24  make bit 7, 6(&3) "branchable"
; takes one cycle, but each conditional branch takes one less
; than skip-over-rjmp or bst b0,i brtc - netting 1 cycle off
; (at the cost of multiplying code)
    brcc    no7     ; 2/3   27
; 1
    brpl    no6but7 ; 3/4   28
; 11
    speed345        ; 23    50
    addA            ; 2
    doHighB 6       ; 2     54
    doubleA         ; 2
    addA            ; 2
    doHighB 7       ; 2     60
    ret

no6but7:            ;       28
; 10
    speed345        ; 23    51
    doHighB 6       ; 2     53
    doubleA         ; 2
    addA            ; 2
    doHighB 7       ; 2     59
    ret
#endif
; for an analysis of expected case cycle count, assume half of

; bits b0:5-1 to be zero for 2.5 cycles less. b0:7 off needs
; 1 cycle less with b1:7 set(.5), another 3 if off(.75). b0:6
; off needs 1 cycle less(.5). _for a uniform distribution_, I'd
; _expect 55.75 cycles_. For a distribution with lower numbers
; more likely (upper bits more likely to be 0, remember b0 to
;  be the least significant eight), expect this to be
; _finished in less than 55 cycles_.

下一步是什么? (修改过的Booth, no hold barred (目前支持计算goto )。)

答案 2 :(得分:0)

部分产品查询(对于(位)四重奏/半字节对)

/* multiply accumulating partial products looked up in a table,
 *  "product scanning, decreasing significance",
 *  non-aligned first (non-aligned partial products need
 *                     to be adjusted "bit-wise").
 * Aligned ones are "starred" below for the need to shift one
 *  of the operand nibbles for combination into a table index.
 * 78 cycles, 256 bytes RAM, 83(69) words  (caveat emptor)
 *(88 cycles, 197 words with table in flash)
 *
 * In : a32:a10, b32:b10 (16-bit factors, two nibbles per byte)
 * Out: p32:p10 = low 16 bits of the product
 * Table: 256 bytes at PTable*256; entry (x<<4)|y holds x*y
 *  (as written by nibbleFiller).
 */
.equ    L   = 0x0f      ; mask: low nibble
.equ    H   = 0xf0      ; mask: high nibble
; if indexing is to work by just setting the low byte, this is the
.equ    PTable  = 1 ; only value possibly working with 512 bytes of RAM
.def    a10 = r16   ; factor a, nibbles 1 and 0
.def    a32 = r17   ; factor a, nibbles 3 and 2
.def    b10 = r18   ; factor b, nibbles 1 and 0
.def    b32 = r19   ; factor b, nibbles 3 and 2
.def    p10 = r20   ; product, low byte
.def    p32 = r21   ; product, high byte
.def    p   = r25   ; temporary
.def    t   = r24   ; temporary (table value / index part)
.def    a0  = r23   ; nibble 0 of a (later swapped into the high nibble)
.def    b0  = r22   ; nibble 0 of b

; NOTE(review): after nibbleFiller returns, execution falls into
; mpy16n16 - looks like demo scaffolding; confirm intended use.
    rcall   nibbleFiller
mpy16n16:
    mov     b0, b10     ; 1
    andi    b0, L       ; 2
    mov     a0, a10     ; 3
    andi    a0, L       ; 4
    ldi     ZH, PTable  ; 5
/* (values to the left of the gap shown for completeness, only)
         03 
    13*    02*
  23     12  01
33* 22*    11* 00*
  32     21  10
    31*    20*
         30         */
; Products of odd total weight (nibble positions 1 and 3) are
; summed first, unshifted, then moved up by one nibble at once.
;03
    mov     ZL, b32 ; 1
    andi    ZL, H   ; 2
    or      ZL, a0  ; 3
    ld      p32, Z  ; 5 10
;12
    mov     ZL, b32 ; 1
    andi    ZL, L   ; 2
    mov     t, a10  ; 3
    andi    t, H    ; 4
    or      ZL, t   ; 5
    ld      t, Z    ; 7
    add     p32, t  ; 8 18
;21
    mov     ZL, a32 ; 1
    andi    ZL, L   ; 2
    mov     t, b10  ; 3
    andi    t, H    ; 4
    or      ZL, t   ; 5
    ld      t, Z    ; 7
    add     p32, t  ; 8 26
;30
    mov     ZL, a32 ; 1
    andi    ZL, H   ; 2
    or      ZL, b0  ; 3
    ld      p, Z    ; 5 31      kept in p, added with the carry below

;01
    mov     ZL, b10 ; 1
    andi    ZL, H   ; 2
    or      ZL, a0  ; 3
    ld      p10, Z  ; 5 36
;10
    mov     ZL, a10 ; 1
    andi    ZL, H   ; 2
    or      ZL, b0  ; 3
    ld      t, Z    ; 5
    add     p10, t  ; 6
    adc     p32, p  ; 7 43
; align nibbles: (p32:p10) << 4, low 16 bits kept
    swap    p10     ; 1
    swap    p32     ; 2
    mov     p, p10  ; 3
; separate nibbles
    andi    p10, H  ; 4
    andi    p32, H  ; 5
    andi    p, L    ; 6 49  postpone nibble addition

;00
    swap    a0      ; 1         a0 now holds its nibble in the high half
    mov     ZL, a0  ; 2
    or      ZL, b0  ; 3
    ld      t, Z    ; 5
    add     p10, t  ; 6
    adc     p32, p  ; 7 56  nibble addition here

;11
    mov     ZL, a10 ; 1
    andi    ZL, H   ; 2
    swap    ZL      ; 3
    mov     t, b10  ; 4
    andi    t, H    ; 5
    or      ZL, t   ; 6
    ld      t, Z    ; 8
    add     p32, t  ; 9 65
;02
    mov     ZL, b32 ; 1
    andi    ZL, L   ; 2
    or      ZL, a0  ; 3
    ld      t, Z    ; 5
    add     p32, t  ; 6 71
;20
    mov     ZL, a32 ; 1
    swap    ZL      ; 2
    andi    ZL, H   ; 3
    or      ZL, b0  ; 4
    ld      t, Z    ; 6
    add     p32, t  ; 7 78 - 256 bytes of RAM, add 10 for flash RAM

    ret

; nibbleFiller: fill the 256-byte table at PTable*256 so that the
; entry at index (x<<4)|y holds x*y for nibbles x and y.
; Each row (fixed x, kept in t) is built by repeated addition.
; Uses Z, t, p and a0; returns once ZH has moved past PTable.
nibbleFiller:
    ldi     ZH, PTable
    clr     ZL
    ser     t               ; t = 0xFF, so the first inc gives row 0
fillRow:
    inc     t               ; next row: multiplier x
    clr     p               ; x * 0
fillCell:
    st      Z+, p           ; table[(x<<4)|y] = x*y
    sbrs    ZH, 0           ; Z ran over into the next page?
    ret                     ; yes: all 256 entries written
    mov     a0, ZL
    andi    a0, L           ; y of the next entry
    breq    fillRow         ; y wrapped to 0: start the next row
    add     p, t            ; x*(y+1) = x*y + x
    rjmp    fillCell
    break

答案 3 :(得分:0)

大端序(从最高位开始处理)的改进型Booth-2算法,已完全展开。这仍是进行中的工作,待办事项:像样的测试装置、仔细审视关键路径(58个周期?!)、清晰的注释(以及如何做好这些“簿记”的好主意)、移动标签以省去重复的公共指令。
16×16→16位(×8位无法获得):

; doubleP: p1:p0 *= 2 (shift the partial product left by one bit)
.MACRO doubleP  ;   adds (shifts/weights) (partial) product
    add     p0, p0  ; +1
    adc     p1, p1  ; +2
.EndM

; b_010: reached from b_01 with p1:p0 = a1:a0 already loaded.
; b0 was shifted left once at mpy16BEB16, so "b0, n" below tests
; original bit n-1.  Placed ahead of the entry point.
b_010:              ;       9 -1
    sbrs    b1, 7   ;1/2
    rjmp    nob20   ;2/3
    add     p1, a0  ; 3         b1:7 set: add 2*a0 to p1
    add     p1, a0  ; 4
nob20:              ;       13
    doHighB 6       ; 2     15  :-(( 14 if b1:7 off
b20:                ;       15
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b200    ; 7     22
    sbrs    b0, 4   ;7/8
    rjmp    b2010   ; 9     24
b2011:              ;       23
    addA            ; 2
    doubleP         ; 4     2
    doHighB 4       ; 6 29
b41:                ;       29
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b410    ; 7     36
b411:
    sbrc    b0, 2   ;7/8
    rjmp    b4111   ; 9     38
b4110:              ;       37
    doubleP         ; 2
    subA            ; 4
    doHighB 2       ; 6     43
b60:                ;       43
    doubleP         ; 2
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     50
b601:               ;       49
    doubleP         ; 2
    addA            ; 4
    doHighB 0       ; 6     55 57
    ret

; mpy16BEB16: 16x16->16 bit unsigned multiply, p1:p0 = a1:a0 * b1:b0,
; scanning b0 from its most significant bit down (unrolled).
; b0 is shifted left here, so every later "b0, n" test refers to
; original bit n-1; b1 bits are applied via doHighB.
; Uses macros addA, doHighB, doubleP and subA; subA is not defined
; in this listing - presumably p1:p0 -= a1:a0 (TODO confirm).
mpy16BEB16:
    lsl     b0      ; 1         C = original bit 7, N = original bit 6
    brcc    b_0     ;2/3    3
b_1:                ;       2
    brpl    b_10    ;1/2    4
b_11:               ;       3
    sbrc    b0, 6   ;1/2
    rjmp    b_111   ; 3     6
b_110:              ;       5
    movw    p0, a0  ; 1
    doHighB 7       ; 3
    doubleP         ; 5
    addA            ; 7         p = 3*a so far
    doHighB 6       ; 9     14  :-|
;b20:               ;       14
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b200    ; 7     21
    sbrs    b0, 4   ;7/8
    rjmp    b2010   ; 9     23
;b2011:             ;       22
    addA            ; 2
    doubleP         ; 4
    doHighB 4       ; 6     28
    rjmp    b41     ; 8     30

; b_0: original bit 7 of b0 is clear; N still holds original bit 6.
b_0:                ;       3
    brmi    b_01    ;4/5    5
b_00:               ;
    sbrc    b0, 6   ;5/6
    rjmp    b_001   ; 7     7
b_000:
    ldi     p0, 0   ; 1     6
    ldi     p1, 0   ; 2
    doHighB 7       ; 4
    add     p1, p1  ; 5         p0 is 0, so doubling p1 doubles p
    rjmp    nob20   ; 7     13  :-/ -> 15

; b_01: original b0 bits 7,6 = 0,1.  ("b0, n" tests original bit
; n-1 because of the lsl at mpy16BEB16.)
; Fix: the b2110 step applied "doHighB 5" a second time although
; four doublings of p remain after it; the bit of b1 with that
; weight is bit 4 (as in b2011, b2010, b2100 and b2111).
b_01:               ;       5
    movw    p0, a0  ; 1         useful to both b0:6 branches
    sbrs    b0, 6   ;2/3
    rjmp    b_010   ; 4     9 -1
b_011:              ;       8
    doHighB 7       ; 2         too lazy for more labels
    doubleP         ; 4
    doHighB 6       ; 6     14  :-|
b21:                ;       14
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b210    ; 7     21
b211:               ;       20
    sbrc    b0, 4   ;7/8
    rjmp    b2111   ; 9     23
b2110:              ;       22
    doubleP         ; 2
    doHighB 4       ; 4
    subA            ; 6     28
    rjmp    b40     ; 8     30  following two skips - ?

; b_10: original b0 bits 7,6 = 1,0.  ("b0, n" tests original bit
; n-1 because of the lsl at mpy16BEB16.)
; Fix: the inlined b2110 step applied "doHighB 5" a second time
; although four doublings of p remain after it; the bit of b1
; with that weight is bit 4.
b_10:               ;       4
    sbrs    b0, 6   ;1/2
    rjmp    b_100   ; 3     7
b_101:              ;       6
    movw    p0, a0  ; 1
    doHighB 7       ; 3
    doubleP         ; 5
    addA            ; 7         p = 3*a so far
    doHighB 6       ; 9     15  :-(
;b21:               ;       15
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b210    ; 7     22
    sbrc    b0, 4   ;7/8
    rjmp    b2111   ; 9     24
;b2110:             ;       23
    doubleP         ; 2
    doHighB 4       ; 4
    subA            ; 6     29  two skips, but 
;b40:               ;       29
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b400    ; 7     36      21
;b401:              ;       35  21
    sbrs    b0, 2   ;1/2
    rjmp    b4010   ; 3     38      18
;b4011:             ;       37  19
    addA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     43
b61:
    doubleP         ; 2     43  13
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     50      6
b611:               ;       52  6
    doubleP         ; 2
    subA            ; 4
    doHighB 0       ; 6     58      :-((
    ret

; b_001: original b0 bits 7,6,5 = 0,0,1.  ("b0, n" tests original
; bit n-1 because of the lsl at mpy16BEB16.)
; Fix: the inlined b2110 step applied "doHighB 5" a second time
; although four doublings of p remain after it; the bit of b1
; with that weight is bit 4.
b_001:              ;       7
    movw    p0, a0  ; 1
    sbrs    b1, 7   ;2/3
    rjmp    nob001  ;3/4
    add     p1, a0  ; 4         b1:7 set: add 2*a0 to p1
    add     p1, a0  ; 5
nob001:
    doHighB 6       ; 7     14  :-|
;b21:               ;       14
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b210    ; 7     21
;b211:              ;       20
    sbrc    b0, 4   ;1/2
    rjmp    b2111   ; 3     23
;b2110:             ;       22
    doubleP         ; 2
    doHighB 4       ; 4
    subA            ; 6     28
;b40:               ;       28
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b400    ; 7     35
;b401:              ;       34
    sbrs    b0, 2   ;1/2
    rjmp    b4010   ; 3     37      18
;b4011:             ;       36
    addA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     42
;b61:
    doubleP         ; 2     42
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     49
;b611:              ;       48      6
    doubleP         ; 8
    subA            ; 10
    doHighB 0       ; 12    54
    ret

; b_100: original b0 bits 7,6,5 = 1,0,0: p = 2*a before continuing
; with the b20 steps (inlined below).
b_100:              ;       7
    movw    p0, a0  ; 1
    doHighB 7       ; 3
    doubleP         ; 5
    doHighB 6       ; 7     14  :-|
;b20:               ;       14
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b200    ; 7     21
    sbrs    b0, 4   ;7/8
    rjmp    b2010   ; 9     23
;b2011:             ;       22
    addA            ; 2
    doubleP         ; 4
    doHighB 4       ; 6     28
    rjmp    b41     ;       30
; b_111: original b0 bits 7,6,5 = 1,1,1.  Instead of adding into
; the product, b1 is incremented (b1 is modified!) and p starts
; from the new b1:7 alone.  Two variants, selected by "expected".
; Fix: both copies of the b2110 step applied "doHighB 5" a second
; time although four doublings of p remain after it; the bit of
; b1 with that weight is bit 4.
#if !expected ; favouring space over expected cycles
b_111:              ;       6 
    clr     p0      ; 1
    inc     b1      ; 2
    clr     p1      ; 3
    sbrc    b1, 7   ;4/5
    mov     p1, a0  ; 5
    add     p1, p1  ; 6
    doHighB 6       ; 8     14  :-|
#else
noB111:             ;       10
    clr     p1      ; 1
    doHighB 6       ; 3     13  :-/
;b21:               ;       13
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b210    ; 7     20
;b211:              ;       19
    sbrc    b0, 4   ;7/8
    rjmp    b2111   ; 9     22
;b2110:             ;       21
    doubleP         ; 2
    doHighB 4       ; 4
    subA            ; 6     27
    rjmp    b40     ; 8     29
b_111:              ;       6 
    clr     p0      ; 1
    inc     b1      ; 2
    brpl    noB111  ;3/4
    mov     p1, a0  ; 4
    add     p1, p1  ; 5
    doHighB 6       ; 7     13  :-/
#endif
;b21:               ;       14
    doubleP         ; 2
    doHighB 5       ; 4
    sbrs    b0, 5   ;5/6
    rjmp    b210    ; 7     21
;b211:              ;       20
    sbrc    b0, 4   ;7/8
    rjmp    b2111   ; 9     23
;b2110:             ;       22
    doubleP         ; 2
    doHighB 4       ; 4
    subA            ; 6     28
    rjmp    b40     ; 8     30

; b200: second bit pair, previous bit 0 and first bit of the pair 0.
; ("b0, n" tests original bit n-1 because of the lsl at entry.)
b200:               ;       22
    sbrs    b0, 4   ;1/2
    rjmp    b2000   ; 3     25
b2001:              ;       24
    doubleP         ; 4
    doHighB 4       ; 6
    addA            ; 8     30
;b41:               ;       30
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b410    ; 7     37
;b411:              ;
    sbrc    b0, 2   ;7/8
    rjmp    b4111   ; 9     39
;b4110:             ;       38
    doubleP         ; 2
    subA            ; 4
    doHighB 2       ; 6     44
;b60:               ;       44
    doubleP         ; 2
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     51
;b601:              ;       50
    doubleP         ; 2
    addA            ; 4
    doHighB 0       ; 6     56
    ret

; b2000: second pair all zero: only shift p and apply b1:4, then
; continue with the third pair (b40 and its inlined successors).
b2000:              ;       25
    doubleP         ; 2
    doHighB 4       ; 4     29
b40:                ;       29 31
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b400    ; 7     36
b401:               ;       35
    sbrs    b0, 2   ;1/2
    rjmp    b4010   ; 3     38      18
b4011:              ;       37
    addA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     43
;b61:
    doubleP         ; 2     43
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     50
;b611:              ;       49      6
    doubleP         ; 8
    subA            ; 10
    doHighB 0       ; 12    55
    ret

; b2010: second pair adds a once (after the second doubling of the
; pair), then the third and fourth pairs follow inline.
b2010:              ;       24
    doubleP         ; 2
    addA            ; 4
    doHighB 4       ; 6     30
;b40:               ;       30
    doubleP         ; 8
    doHighB 3       ; 10
    sbrs    b0, 3   ; 11            XXX
    rjmp    b400    ; 13    37
;b401:              ;       36
    sbrs    b0, 2   ;1/2
    rjmp    b4010   ; 3     39      18
;b4011:             ;       38
    addA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     44
;b61:
    doubleP         ; 2     44
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     51
;b611:              ;       50      6
    doubleP         ; 8
    subA            ; 10
    doHighB 0       ; 12    56
    ret

; b210: second bit pair, previous bit 1 and first bit of the pair 0.
; ("b0, n" tests original bit n-1 because of the lsl at entry.)
; Fix: the b2101 step applied "doHighB 5" a second time although
; four doublings of p remain after it; the bit of b1 with that
; weight is bit 4 (as in b2100 and b2111).
b210:               ;       21 ? 22
    sbrs    b0, 4   ;1/2
    rjmp    b2100   ; 3     24
;b2101:             ;       24
    doubleP         ; 2
    doHighB 4       ; 4
    subA            ; 6     30
;b41:               ;       30
    doubleP         ; 2
    doHighB 3       ; 4
    sbrs    b0, 3   ;5/6
    rjmp    b410    ; 7     37
;b411:              ;       36
    sbrc    b0, 2   ;1/2
    rjmp    b4111   ; 3     39
;b4110:             ;       38
    doubleP         ; 2
    subA            ; 4
    doHighB 2       ; 6     44
;b60:
    doubleP         ; 2     44
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     51
;b601:              ;       50
    doubleP         ; 8
    addA            ; 10
    doHighB 0       ; 12    56
    ret

; b2100: second pair subtracts a before the pair's second doubling
; (i.e. 2*a at the pair's weight), then pairs three and four
; follow inline.
b2100:              ;       24
    subA            ; 2
    doubleP         ; 4
    doHighB 4       ; 6     30
;b40:               ;       30
    doubleP         ; 8
    doHighB 3       ; 10
    sbrs    b0, 3   ; 11
    rjmp    b400    ; 13    37
;b401:              ;       36  21
    sbrs    b0, 2   ;1/2
    rjmp    b4010   ; 3     39      18
;b4011:             ;       38  18
    addA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     44
;b61:
    doubleP         ; 2     44  12
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     51
;b611:              ;       50  6
    doubleP         ; 8
    subA            ; 10
    doHighB 0       ; 12    56
    ret

; b2111: second pair contributes nothing from b0: shift p, apply
; b1:4 and continue at b41.
b2111:              ;       23
    doubleP         ; 2
    doHighB 4       ; 4     27
    rjmp    b41     ; 6     29


; b400: third bit pair, previous bit 0 and first bit of the pair 0.
; ("b0, n" tests original bit n-1 because of the lsl at entry.)
b400:               ;       37  21
    sbrs    b0, 2   ;1/2
    rjmp    b4000   ; 3     40      16
;b4001:             ;       39  19
    doubleP         ; 2
    addA            ; 4
    doHighB 2       ; 6     45
;b61:
    doubleP         ; 2     45  13
    doHighB 1       ; 4
    sbrc    b0, 1   ;5/6
    rjmp    b611    ; 7     52      6
;b610:              ;       51  6
    subA            ; 2
    doubleP         ; 4
    doHighB 0       ; 6     57      :-(
    ret

; b4000: third pair all zero: shift p, apply b1:2, then the last
; pair (b60) inline.
b4000:              ;       40  16
    doubleP         ; 2
    doHighB 2       ; 4     44
;b60:
    doubleP         ; 2     44  12
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     51
;b601:              ;       50  6
    doubleP         ; 2
    addA            ; 4
    doHighB 0       ; 6     56
    ret

; b4010: third pair adds a once, then the last pair (b60) inline.
b4010:              ;       39  18
    doubleP         ; 2
    addA            ; 4
    doHighB 2       ; 6     45
;b60:               ;       45  12
    doubleP         ; 2
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     52
;b601:              ;       51  6
    doubleP         ; 2
    addA            ; 4
    doHighB 0       ; 6     57
    ret

; b410: third bit pair, previous bit 1 and first bit of the pair 0.
; ("b0, n" tests original bit n-1 because of the lsl at entry.)
b410:               ;       37  21
    sbrs    b0, 2   ;1/2
    rjmp    b4100   ; 3     40      18
;b4101:             ;       39  18
    doubleP         ; 2
    subA            ; 4
    doHighB 2       ; 6     45
;b61:
    doubleP         ; 2     45  12
    doHighB 1       ; 4
    sbrc    b0, 1   ;5/6
    rjmp    b611    ; 7     52
;b610:              ;       51  6
    subA            ; 2
    doubleP         ; 4
    doHighB 0       ; 6     57      :-(
    ret

; b4100: third pair subtracts a before the pair's second doubling,
; then the last pair (b60) inline.
b4100:              ;       40  18
    subA            ; 2
    doubleP         ; 4
    doHighB 2       ; 6     46
;b60:
    doubleP         ; 2     46  12
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b600    ; 7     53      4
;b601:              ;       52  6
    doubleP         ; 2
    addA            ; 4
    doHighB 0       ; 6     58      :-((
    ret

; b4111: third pair contributes nothing from b0: shift p, apply
; b1:2, then the last pair (b61) inline.
b4111:              ;       39  17
    doubleP         ; 2
    doHighB 2       ; 4     43
;b61:
    doubleP         ; 2     43  13
    doHighB 1       ; 4
    sbrs    b0, 1   ;5/6
    rjmp    b610    ; 7     50      6
;b611:              ;       49
    doubleP         ; 2         6
    subA            ; 4
    doHighB 0       ; 6     55
    ret

; b600: last step with nothing to add from b0: final doubling of p
; and b1:0.
b600:               ;       51  4
    doubleP         ; 2
    doHighB 0       ; 4     55  ;-)
    ret

; b610: last step subtracting a before the final doubling (2*a in
; total), then b1:0.
b610:               ;       51  6
    subA            ; 2
    doubleP         ; 4
    doHighB 0       ; 6     57
    ret

; Scaffolding and a work-in-progress test loop for mpy16BEB16.
theEnd:
stuck:
    break
    sleep
    rjmp    stuck
empty: ret
; 16-bit test values; these re-use the names a, b and p for
; other registers than the listings further up.
.def    a   = r22
.def    b   = r26
.def    ah  = r23
.def    bh  = r27
.def    p   = r24
.def    ph  = r25
testTest:
    ldi     a, 15
    ldi     ah, 1
    movw    b, a
nextA:
    subi    a, -1       ; step ah:a to the next test factor
    sbci    ah, -2
    ldi     b, 13
    ldi     bh, 128
    movw    p, a        ; ph:p = ah:a: start value of the reference
nextB:
    adiw    b, 31
    movw    a0, a       ; load the routine's input registers
    movw    b0, b
    rcall   mpy16BEB16
    cp      p0, p       ; compare the result with the reference
    cpc     p1, ph
; NOTE(review): no conditional branch follows the compare, so
; "bad" is called on every pass, whatever the result.
    rcall   bad
; NOTE(review): the reference grows by a per pass while b grows
; by 31 - it does not look like it tracks a*b; confirm intent.
    add     p, a
    adc     ph, ah
    ldi     a0, 130     ; a0 used as scratch for the compare
    cpi     b, 3
    cpc     bh, a0      ; loop until bh:b reaches 130*256+3
    brmi    nextB
    rjmp    nextA

; bad: returns at once; the instructions after ret are never
; reached (presumably a place to set a breakpoint - confirm).
bad:
    ret
    break
    sleep
    rjmp    bad

答案 4 :(得分:0)

作为起点的有符号和无符号8×8→8 / 16,16×16→16/32和8×16→16/24位的相关算法和实现列表:

答案 5 :(得分:0)

由于始终无法让改进型 Booth(modified Booth)算法低于 58 个周期而感到沮丧,我粗略尝试了使用预先计算的倍数 3·a 的方案。“中间位对”所需的周期数(不出所料?)与我试过的 Booth-2 变体完全相同,都是 15 个周期;而第一对和最后一对耗时太长。我最终停在了 64 个周期:

; mpy16P316: outline of a 16x16->16 multiply that handles multiplier bits
;  of b0 in pairs with the help of a precomputed 3*a (a3h:a3), and the
;  bits of b1 via doHighB.
; NOTE(review): this listing reads as a sketch rather than assemblable
;  code: the macros movP, double, addP and add3 are not defined in the
;  visible source, the alternatives _3 / _2 / _0 / _1 and no5 / no56 /
;  no6 run into each other without a jump, and bit pair 4/3 is only
;  indicated by the comment "same for 43".
; The number columns are the author's cycle counts.
mpy16P316:                  ; 0
;prepare a3h:a3 = 3 * a1:a0 ...
    movP    a3,a3h, a0,a1   ; 2
    double  a3, a3h         ; 4
;   addP    a3,a3h, a0,a1   ; 6         ... by half
    lsl     b0              ; 5         gains speed exactly once
    brcc    _               ;6/7
    brpl    _2              ;7/8
_3:                         ;       7
    addP    a3,a3h, a0,a1   ; 2         other half
    movP    p0,p1, a3,a3h   ; 4     11

_2:                         ;       8
    movP    p0,p1, a3,a3h   ; 2         reason for delay
    addP    a3,a3h, a0,a1   ; 4     12  other half

_:                          ;       7
    brmi    _1              ;1/2    9
_0:                         ;       8
    ldi     p0, 0           ; 1
    ldi     p1, 0           ; 2
    addP    a3,a3h, a0,a1   ; 4     12  other half

_1:                         ;       9
    movP    p0,p1, a0,a1    ; 2
    addP    a3,a3h, a0,a1   ; 4     13

    doHighB 7               ; 2     13
    doHighB 7               ; 4     17

    doHighB 6               ; 2     17
    doubleP                 ; 4         1
    doHighB 5               ; 6
    sbrs    b0, 6           ;7/8
    rjmp    no6             ; 9
    sbrs    b0, 5           ;9/10
    rjmp    no5             ; 11
    doubleP                 ; 12        2
    add3                    ; 14

no5:                        ;       "11"
    addA                    ; 13
no56:
    doubleP                 ; 15    32  2

no6:                        ;       "9"
    sbrs    b0, 5           ;10/11
    rjmp    no56            ; 12
    doubleP                 ; 13        2
    addA                    ; 15    32

; same for 43               ; 15

    doHighB 2               ; 2     47
    doubleP                 ; 4         5
    doHighB 1               ; 6
    sbrs    b0, 2           ;7/8
    rjmp    no2             ; 9
    sbrs    b0, 1           ;9/10
    rjmp    no1             ; 11
    doubleP                 ; 12        6
    add3                    ; 14

no1:                        ;       11
    addA                    ; 13
no12:
    doubleP                 ; 15        6

no2:                        ;       9
    sbrs    b0, 1           ;10/11
    rjmp    no12            ; 12
    doubleP                 ; 13        6
    addA                    ; 15
    doHighB 0               ; 17    64?!

    ret

答案 6 :(得分:0)

第一次尝试使用计算跳转(computed goto),基于从低位端(Little End)开始的 Booth-2 算法。它比“交织展开的 Big-Endian Booth”慢(59 对 57 个周期(?)),但更小(约 401 字,外加返回指令)。

.equ    base    =   256
; modified Booth from Little End; multiply proper at mpy16LEB16
; 16 snippets reached via computed goto, for 1 multiplier nibble
; Layout: the snippet for nibble n of b0 starts at base + 16*n (the entry
;  code builds Z from high(base) and the swapped low nibble of b0), so
;  every snippet has to fit into 16 words.  Each snippet applies the low
;  four multiplier bits to p, doubles a three times along the way, does
;  doHighB for bits 1 and 2 (and 3, where room permits) and leaves via
;  hi0 (nothing owed) or hi1 (16*a still owed); entering at hi0+2 / hi1+2
;  skips the two-word doHighB 3 found there.
.org base
; 00 00     low nibble = 0: only doubling and the high-byte bits
    doubleA         ; 2
    doHighB 1       ; 4
    doubleA         ; 6
    doHighB 2       ; 8
    doubleA         ;10         low 4 bits done, 6 words to go

;   doHighB 3       ;12         could keep doing this all day ...
;   doubleA         ;14         for no conceivable gain
;;  doHighB 4       ;16         this would be the 1st too many
;   rjmp    hi0+4   ;16 -4
    rjmp    hi0     ;12
; b600 uses doubleP rather than doubleA: NOTE(review) it looks like a
;  fragment of the Big-Endian routine parked in the five words this
;  snippet leaves unused -- confirm it is not defined a second time.
b600:               ;       51  squeezing the shortest out and in 
    doubleP         ; 2         branches saves _five_ words
    doHighB 0       ; 4     55  ;-)
    ret
.org base+16
; 00 01
    addA            ; 2
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12         low 4 bits done, 4 words to go

    doHighB 3       ;14
    rjmp    hi0+2   ;16
.org base+32
; 00 10     low nibble = 2: contributes +2*a, nothing owed -> leave via hi0
;  A plain add is used instead of the "true Booth" recoding -2 +4: it
;  costs the same 16 cycles, whereas subtracting 2*a without the matching
;  +4*a would leave the product short by 4*a.
    doubleA         ; 2         a1:a0 = 2*a
    addA            ; 4         p += 2*a
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12

    doHighB 3       ;14
    rjmp    hi0+2   ;16         skip the doHighB 3 at hi0 (2 words)
.org base+48
; 00 11     low nibble = 3, recoded as -1 +4; nothing owed -> leave via hi0
;  The +4*a term takes the two words otherwise spent on doHighB 3, so
;  this snippet enters hi0 at its start and lets hi0 handle that bit.
    subA            ; 2         p -= a
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8         a1:a0 = 4*a
    doHighB 2       ;10
    addA            ;12         p += 4*a
    doubleA         ;14
    rjmp    hi0     ;16
.org base+64
; 01 00
    doubleA         ; 2
    doHighB 1       ; 4
    doubleA         ; 6
    doHighB 2       ; 8
    addA            ;10
    doubleA         ;12

    doHighB 3       ;14
    rjmp    hi0+2   ;16
.org base+80
; 01 01
    addA            ; 2
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    addA            ;12
    doubleA         ;14
    rjmp    hi0     ;16
.org base+96
; 01 10
    doubleA         ; 2
    subA            ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12
    addA            ;14
    rjmp    hi0     ;16
.org base+112
; 01 11     low nibble = 7, recoded as -1 +8; nothing owed -> leave via hi0
    subA            ; 2         p -= a
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12         a1:a0 = 8*a
    addA            ;14         p += 8*a
    rjmp    hi0     ;16
.org base+128
; 10 00
    doubleA         ; 2
    doHighB 1       ; 4
    doubleA         ; 6
    doHighB 2       ; 8
    doubleA         ;10
    subA            ;12
    doHighB 3       ;14
    rjmp    hi1+2   ;16
.org base+144
; 10 01
    addA            ; 2
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12
    subA            ;14
    rjmp    hi1     ;16
.org base+160
; 10 10
    doubleA         ; 2
    subA            ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    subA            ;12
    doubleA         ;14
    rjmp    hi1     ;16
.org base+176
; 10 11     low nibble = 11, recoded as -1 -4 +16; the +16 is owed, so
;           leave via hi1
    subA            ; 2         p -= a
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8         a1:a0 = 4*a
    doHighB 2       ;10
    subA            ;12         p -= 4*a
    doubleA         ;14
    rjmp    hi1     ;16
.org base+192
; 11 00
    doubleA         ; 2
    doHighB 1       ; 4
    doubleA         ; 6
    doHighB 2       ; 8
    subA            ;10
    doubleA         ;12
    doHighB 3       ;14
    rjmp    hi1+2   ;16
.org base+208
; 11 01
    addA            ; 2
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    subA            ;12
    doubleA         ;14
    rjmp    hi1     ;16
.org base+224
; 11 10
    doubleA         ; 2
    subA            ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12
    doHighB 3       ;14
    rjmp    hi1+2   ;16
.org base+240
; 11 11
    subA            ; 2
    doubleA         ; 4
    doHighB 1       ; 6
    doubleA         ; 8
    doHighB 2       ;10
    doubleA         ;12         making the fast cases symmetrical
; hi1: continuation after a low-nibble snippet that left 16*a owed
;  (multiplier bit 3 was 1).  Handles bits 3 and 4 of b1, then bits 4..7
;  of b0.  With bit 5 clear the amount due at weight 16 is 1 + bit 4:
;   bit 4 = 0 -> +16*a (add41: add, then double)
;   bit 4 = 1 -> +32*a (add42: double, then add)
;  With bit 5 set, sub4_1 takes over.
hi1:
    doHighB 3       ; 2     28
    doubleA         ; 4
    doHighB 4       ; 6
; four bits to go, last known to have been 1
    sbrc    b0, 5   ;7/8
    rjmp    sub4_1  ; 9     37
;add4_1
    sbrs    b0, 4   ;9/10       bit 4 clear: only the owed 16*a is due
    rjmp    add41   ; 11
;add42                      38
    doubleA         ; 2
    addA            ; 4         +32*a
;b20                        42
    doHighB 5       ; 2
    doubleA         ; 4
    doHighB 6       ; 6
    lsl     b0      ; 7         C = old bit 7, N = old bit 6
    brcc    add6_0  ;8/9    51
;sub6_0:    1b0
    brmi    sub61   ;9/10   52
;sub62                      51
    doubleA         ; 2
    subA            ; 4         -128*a (256*a was added up front)
    doHighB 7       ; 6     57  _not_ funny
    ret
; sub61: final step for multiplier bits 7..5 = 1 1 0 with nothing owed.
;  The entry code has already added 256*a for bit 7, so bits 7 and 6
;  (128 + 64 = 192 = 256 - 64) still need -64*a; a1:a0 holds 64*a here,
;  hence subtract first and double afterwards for doHighB 7.
sub61:              ;       52
    subA            ; 2         p -= 64*a
    doubleA         ; 4
    doHighB 7       ; 6     58  _not at all_
    ret

add6_0:             ;       51
    brpl    noAS6   ;1/2    53
    addA            ; 3
    doubleA         ; 5
    doHighB 7       ; 7     58  _not at all_
    ret
noAS6:              ;       53
    doubleA         ; 2
    doHighB 7       ; 4     57  _not_ funny
    ret

sub4_1:; bb1b1              37  
    sbrc    b0, 4   ;1/2
    rjmp    sub40   ; 3     40
sub41:              ;       39
    subA            ; 2
sub40:              ;       41
    doubleA         ; 4     43
;b21                        43
    doHighB 5       ; 2
    doubleA         ; 4
    doHighB 6       ; 6
    lsl     b0      ; 7
    brcs    sub6_1  ;8/9    52
;add6_1:    0b1
    brpl    add61   ;9/10   53
;add62                      52
    doubleA         ; 2
    addA            ; 4
    doHighB 7       ; 6     58  _not_ funny
    ret
add61:              ;       53
    addA            ; 2
    doubleA         ; 4
    doHighB 7       ; 6     59  _not at all_
    ret

add4_0:; bb0b0              37  
    sbrs    b0, 4   ;1/2
    rjmp    add40   ; 3     40
add41:              ;       39
    addA            ; 2
add40:              ;       41
    doubleA         ; 4     43
;b20                        43
    doHighB 5       ; 2
    doubleA         ; 4
    doHighB 6       ; 6
    lsl     b0      ; 7
    brcc    add6_0  ;8/9    52
;sub6_0:    1b0
    brmi    sub61   ;9/10   53
;sub62                      52
    doubleA         ; 2
    subA            ; 4
    doHighB 7       ; 6     57  _not_ funny
    ret
sub6_1:             ;       51
    brmi    noAS6   ;1/2    53
    subA            ; 3
    doubleA         ; 5
    doHighB 7       ; 7     58  _not at all_
    ret 

; hi0: continuation after a low-nibble snippet that left nothing owed
;  (multiplier bit 3 was 0).  Handles bits 3 and 4 of b1, then bits 4..7
;  of b0.  With bit 5 clear, add4_0 takes over.  With bit 5 set, 64*a
;  becomes owed and the amount due now is
;   bit 4 = 1 -> 48 - 64 = -16*a (sub41: subtract, then double)
;   bit 4 = 0 -> 32 - 64 = -32*a (sub42: double, then subtract)
hi0:
    doHighB 3       ; 2     28
    doubleA         ; 4
    doHighB 4       ; 6
; four bits to go, last known to have been 0
    sbrs    b0, 5   ;7/8
    rjmp    add4_0  ; 9
;sub4_0
    sbrc    b0, 4   ;9/10       bit 4 set: only -16*a is due
    rjmp    sub41   ; 11    39
;sub42                      38
    doubleA         ; 2
    subA            ; 4         -32*a
;b21                        42
    doHighB 5       ; 2
    doubleA         ; 4
    doHighB 6       ; 6
    lsl     b0      ; 7         C = old bit 7, N = old bit 6
    brcs    sub6_1  ;8/9    51
;add6_1:    0b1
    brpl    add61   ;9/10   52
;add62                      51
    doubleA         ; 2
    addA            ; 4         +128*a
    doHighB 7       ; 6     57  _not_ funny
    ret

; mpy16LEB16: entry point of the computed-goto multiply.
;  In:   a1:a0 and b1:b0 = factors
;  Out:  p1:p0 is built up by the snippet jumped to and the hi0/hi1 code
;        that follows it
;  Uses: ZH:ZL as the jump target
;  Z is formed from high(base) and the swapped low nibble of b0, i.e.
;  base + 16 * (b0 & 15): one 16-word snippet per nibble value.
.equ    code    =   high(base)
mpy16LEB16:         ;       0   modified Booth from Little End
    mov     ZL, b0  ; 1
    andi    ZL, 15  ; 2
    swap    ZL      ; 3         ZL = (b0 & 15) << 4
    ldi     ZH, code; 4
    ldi     p0, 0   ; 5
    ldi     p1, 0   ; 6
    sbrc    b0, 7   ;7/8        bit 7 of b0 set: add a0 to p1 (256*a) up front
    add     p1, a0  ; 8
    doHighB 0       ;10
    ijmp            ;12

答案 7 :(得分:0)

接下来是注重空间占用的实现(供参考,谈不上合理)。所用资源的数据应注明其可信程度(g:胡乱猜测,G:猜测,e:有依据的猜测,E:估算,s:仿真,a:分析,A:分析并(例如通过仿真)得到印证,m:实测)。“字数 × 最坏情况周期数”是一种类似于 IC 设计中“面积 × 延迟”的成本度量(可作为单一数值的“品质因数”?)。

$('select[name=category]').change(function () {
    console.log($(":selected", this).text());
});

(我不止一次检查过相同的“wordcycle entries”。)
宏,应该可以理解为

algoritm            bits    cycles words   regs  remarks
                            wc exp   ×wccc excl.
                                           a,b,p
shift factor left  16×16→16(61 56  87 5307       see other
                            62 57  62 3844       answer)
                            73 68  37 2701
                            81 77  24 1944       (see edit history)
                            85 70g 15 1275       w*expcc~1050
                           108 64g 18 1944       w*expcc~1150
(jump table, for reference  51E49g 888e 44K G   (almost done)
                            44E39g2888E127K e)

16×16→16位,85/81周期,15/24字:

.MACRO doubleA  ;   a1:a0 <<= 1: doubles the weight of factor "a"
    lsl     a0      ; +1        lsl Rd assembles to  add Rd, Rd
    rol     a1      ; +2        rol Rd assembles to  adc Rd, Rd
.EndM
.MACRO doHighB  ;   "does" bit in b1, bit number as a parameter
; @0 = bit number in b1.  If that bit is set, a0 is added to p1 only,
;  i.e. the current low byte of "a" goes into the high product byte;
;  with the bit clear, sbrc skips the add.
    sbrc    b1, @0  ; 1
    add     p1, a0  ; 2
.EndM
; condAdd @0, @1: handles bit @0 of b1 (via doHighB) and bit @1 of b0.
;  If bit @1 of b0 is set, sbrs skips the rjmp and addA runs; otherwise
;  rjmp PC+3 jumps over the two-word addA.
.MACRO condAdd
    doHighB @0      ; +2
    sbrs    b0, @1  ; +3
    rjmp    PC+3    ;+4/5
    addA            ; +6
.EndM
.MACRO step16; "do" 2 bits, bit# in b1 and b0 as a parameter
; @0 = bit number, used for both b1 and b0; afterwards "a" is doubled
;  ready for the next higher bit.
    condAdd @0, @0  ; +6
    doubleA         ; +8
.EndM

16×16→16位,73个周期,37个字:

; mpy16x16: looping shift-and-add multiply, least significant bit first.
;  In:   a1:a0 = multiplicand, b1:b0 = multiplier
;  Out:  p1:p0 = low 16 bits of the product
;  Clobbers: a1:a0 (doubled every round), b1:b0 (shifted right), flags
;  Per round: a set bit shifted out of b0 adds a1:a0 to p1:p0; a set bit
;  shifted out of b1 adds a0 to p1 (weight 256); then "a" is doubled.
;  The loop ends once b0 and b1 are both zero, or -- in the earlyOutA
;  variant, which "#if 1||earlyOutA" always enables -- once "a" is zero.
mpy16x16:           ;       0
    clr     p0      ; 1
    clr     p1      ; 2
; wanting early out: shifting the factor; faster from Little End
    lsr     b0      ; 3
    brcc    shiftB1 ;4/5
addFull:
    addA            ; 2
shiftB1:            ;       due to handling this 2nd multiplier
    lsr     b1      ; 3     bit even if the multiplicand is zero
    brcc    pc+2    ;4/5    after the first shift, the earlyOutA
addHigh:            ;       variant is 3 cycles slower than 4.8
    add     p1, a0  ; 5     libgcc __mulhi3 - for * 0 or 0x8000
shiftA:
    doubleA         ; 7         why is adc zero-flag handling ...
#if 1||earlyOutA
; Z after doubleA reflects a1 only, hence the separate test of a0
    brne    shiftB0 ;+1/2   7   ... different from subc/sbci/cpc?
    tst     a0      ;+ 2
    breq    done    ;+ 3/-1upto-69?
#endif
shiftB0:
    lsr     b0      ; 8
    brcs    addFull ;9/10
; carry is clear here; sbci can only keep or clear Z, so Z survives only
;  if b0 (after the shift) and b1 are both zero
    sbci    b1, 0   ; 10    presume zero or high reg?
    brne    shiftB1 ;11/12-2
done:               ; wc:   8*10+5=85   @15+1 words (?!)
    ret             ; best: 14 (0=b&0xfffe) (none for a)
                    ;(earlyOutA: wc: 8*13+4=108 @18+1 words)

答案 8 :(得分:0)

最后是“无所不用其极”(No Holds Barred)版本,尽管还不是最终状态。噢,答案正文不能超过 30000 个字符,而未经删节的源码约有 55K——以后再说。体积偏大(约 2900 字),但速度快(≤ 44 个周期,预期约 39 个)。

2015-06-26 14:16:16.042 ibtoold[16859:662960] [MT] DVTAssertions: ASSERTION FAILURE in /Library/Caches/com.apple.xbs/Sources/IDEInterfaceBuilder/IDEInterfaceBuilder-8121.17/InterfaceBuilderKit/Document/Platform/IBIdiom.m:105
Details:  Assertion failed: !
Object:   <IBIPadIdiom: 0x7f8f53e1cb50>
Method:   -filePathWithTargetDeviceSuffixForBaseFilePath:
Thread:   <NSThread: 0x7f8f53c09d80>{number = 1, name = main}
Hints: None
Backtrace:
  0  0x0000000104ba2ff9 -[DVTAssertionHandler handleFailureInMethod:object:fileName:lineNumber:assertionSignature:messageFormat:arguments:] (in DVTFoundation)
  1  0x0000000104ba2b2d _DVTAssertionHandler (in DVTFoundation)
  2  0x0000000104ba2d44 _DVTAssertionFailureHandler (in DVTFoundation)
  3  0x0000000104ba2ca6 _DVTAssertionFailureHandler (in DVTFoundation)
  4  0x0000000105c972b5 -[IBIdiom filePathWithTargetDeviceSuffixForBaseFilePath:] (in IDEInterfaceBuilderKit)
  5  0x000000010f8e6904 (in IDEInterfaceBuilderCocoaTouchIntegration)
  6  0x0000000105b66866 -[IBDocument finishCompilingWithOutputPath:options:error:] (in IDEInterfaceBuilderKit)
  7  0x0000000104961ff6 (in ibtoold)
  8  0x00000001049644dc (in ibtoold)
  9  0x0000000104967f15 (in ibtoold)
 10  0x00000001049685c9 (in ibtoold)
 11  0x00000001049684a0 (in ibtoold)
 12  0x0000000104957bab (in ibtoold)
 13  0x0000000104967b0a (in ibtoold)
 14  0x0000000104966d6f (in ibtoold)
 15  0x00007fff890825c9 start (in libdyld.dylib)
Command /Applications/Xcode-beta.app/Contents/Developer/usr/bin/ibtool failed with exit code 255

...

star_t:
    rjmp    testTest
.org    0x20

.def    a0  = r16   ; addend low byte
.def    a1  = r17
.def    m0  = r18   ; multiplier low byte
.def    m1  = r19
.def    p0  = r20   ; (partial) product low byte
.def    p1  = r21
.def    _zero=r1
.def    tmp = r0
; some macros using factors a1:a0, m1:m0 and product p1:p0
.MACRO addA     ;   adds (weighted) factor "a" into product
    add     p0, a0  ; +1
    adc     p1, a1  ; +2
.EndM
.MACRO subA     ;   subtracts (weighted) factor "a" from product
    sub     p0, a0  ; +1
    sbc     p1, a1  ; +2
.EndM
.MACRO doubleP  ;   p1:p0 <<= 1: doubles the (partial) product
    lsl     p0      ; +1        lsl Rd assembles to  add Rd, Rd
    rol     p1      ; +2        rol Rd assembles to  adc Rd, Rd
.EndM

; mpy1616: table-driven 16x16->16 multiply in two dispatched stages.
;  In:   a1:a0 = multiplicand, m1:m0 = multiplier
;  Out:  p1:p0 = low 16 bits of a * m
;  Uses: XH:XL, ZH:ZL; the snippets reached from here may change a0, XL
;  Stage 1 (mpy1616): prepare XH:XL = a << 4, preload p1:p0 = a1:a0 and
;   jump through table "jump", indexed by m0, to a loNN snippet.
;  Stage 2 (doM1): jump through table "highs", indexed by m1, to a hiNN
;   snippet that adds m1 * a0 to p1 and returns.
;  settle: used by low-byte snippets that leave the product short by
;   256 * a; incrementing m1 makes stage 2 add a0 to p1 once more.
settle:
    inc     m1      ; 1         _looks_ smarter than  add p1, a0
doM1:
    clr     XH      ; 2         ?
; mov a0, tmp
    ldi     ZH, high(highs); 3
    mov     ZL, m1  ; 4
    ijmp            ; 6
mpy1616:            ; 0
    movw    XL, a0  ; 1
    andi    XH, 15  ; 2
    eor     XH, XL  ; 3         XH = a0 ^ (a1 & 15)
    andi    XL, 15  ; 4
; swapping first could use -+ (dropping carry) in stead of ex-or
    swap    XL      ; 5         XL = a0 << 4
; _if_ XH was used in few worst loxx cases, do these "on demand"
    swap    XH      ; 6         XH = ((a1^a0)<<4)|(a0>>4)
    eor     XH, XL  ; 7         XH = (a1<<4)|(a0>>4)
; mov tmp, a0
    movw    p0, a0  ; 1         other way 'round with gcc ABI?
    ldi     ZH, high(jump); 1
    mov     ZL, m0  ; 2
trampoline:
    ijmp            ; 14 (12+2) + 15 + 8 + 7 - 44? really?

...

; High-byte snippets hiNN: reached via the "highs" table with ZL = m1;
;  hiNN adds (0xNN * a0) mod 256 to p1 and returns ("done" is ret here).
;  Building blocks: a0 (may be doubled in place) and XL = a0 << 4, i.e.
;  16 * a0.  Labels fall through into shared tails; the number column
;  counts the instructions left to execute.
#define done    ret
hi07:                       ; 7 = 1 + 3*2
    add     p1, a0  ; 5
hi06:                       ; 6 = 3*2
    add     a0, a0  ; 4
hi03:
    add     p1, a0  ; 3
hi02:
    add     p1, a0  ; 2
hi01:
    add     p1, a0  ; 1
;hi00:
    done
hi0a:                       ; 10 = 2 + 8
    add     a0, a0  ; 5
hi05:                       ; 5 = 1 + 4
    add     p1, a0  ; 4
hi04:
    add     a0, a0  ; 3
    add     a0, a0  ; 2
    add     p1, a0  ; 1
    done
hi09:                       ; 9 = 1 + 8
    add     p1, a0  ; 5
hi08:
    add     a0, a0  ; 4
    add     a0, a0  ; 3
    add     a0, a0  ; 2
    add     p1, a0  ; 1
    done
hi0b:                       ; 11 = -1 - 4 + 16
    sub     p1, a0  ; 5
hi0c:                       ; 12 = -4 + 16
    add     a0, a0  ; 4
    add     a0, a0  ; 3
    sub     p1, a0  ; 2
    add     p1, XL  ; 2
    done

hi0d:                       ; 13 = -3 + 16
    sub     p1, a0  ; 4
hi0e:
    sub     p1, a0  ; 3
hi0f:
    sub     p1, a0  ; 2
    add     p1, XL  ; 1
    done

hi17:
    add     p1, a0  ; 6
hi16:
    add     a0, a0  ; 5
hi13:
    add     p1, a0  ; 4
hi12:
    add     p1, a0  ; 3
hi11:
    add     p1, a0  ; 2
hi10:
    add     p1, XL  ; 1
    done
;hi1a:
;   add     a0, a0  ; 6
hi15:
    add     p1, a0  ; 5
hi14:
    add     a0, a0  ; 4
    add     a0, a0  ; 3
    add     p1, a0  ; 2
    add     p1, XL  ; 1
    done
hi19:
    add     p1, a0  ; 6
hi18:
    add     p1, XL  ; 5
    add     a0, a0  ; 4
    add     a0, a0  ; 3
    add     p1, a0  ; 2
    add     p1, a0  ; 1
    done
hi1a:
    sub     p1, a0  ; 6
hi1b:
    sub     p1, a0  ; 5
hi1c:
    add     a0, a0  ; 4
    sub     XL, a0  ; 3
    add     XL, XL  ; 2
    add     p1, XL  ; 1
    done
hi1d:
    sub     p1, a0  ; 5
hi1e:
    sub     p1, a0  ; 4
hi1f:
    sub     p1, a0  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done

hi27:
    add     p1, a0  ; 7
hi26:
    add     a0, a0  ; 6
hi23:
    add     p1, a0  ; 5
hi22:
    add     p1, a0  ; 4
hi21:
    add     p1, a0  ; 3
hi20:
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi2a:
    add     a0, a0  ; 6
hi25:
    add     p1, a0  ; 5
hi24:
    add     XL, a0  ; 4
    add     XL, a0  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
; ...
hi4d:
    sub     p1, a0  ; 7
hi4e:
    sub     p1, a0  ; 6
hi4f:
    sub     p1, a0  ; 5
    add     p1, XL  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    add     p1, XL  ; 1
    done

hi53:;
    add     p1, a0  ; 7
hi52:;
    add     p1, a0  ; 6
hi51:;
    add     p1, a0  ; 5
hi50:;
    add     p1, XL  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    add     p1, XL  ; 1
    done
hi56:;
    add     p1, a0  ; 7
; ...
hi5d:;
    sub     p1, a0  ; 7
hi5e:;
    sub     p1, a0  ; 6
hi5f:;
    sub     p1, a0  ; 5
    add     XL, XL  ; 4
    add     p1, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done

hi63:;
    add     p1, a0  ; 7
hi62:;
    add     p1, a0  ; 6
hi61:;
    add     p1, a0  ; 5
hi60:;
    add     XL, XL  ; 4
    add     p1, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
; ...
hi68:;
    add     p1, a0  ; 7
hi67:;
    add     p1, a0  ; 6
    add     XL, a0  ; 5
    add     XL, XL  ; 4
    add     p1, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi6a:;
    add     p1, a0  ; 7
hi69:;              ;           105 ~ 15 * 7
    sub     XL, a0  ; 6
    add     p1, XL  ; 5
    add     XL, XL  ; 4
    add     p1, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi6b:; no symmetry
    sub     p1, a0  ; 7
hi6c:; no symmetry
    add     XL, a0  ; 6
    add     XL, a0  ; 5
    add     XL, XL  ; 4
    add     p1, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi6d:; no symmetry
;01101101
    sub     p1, a0  ; 6
hi6e:;
    sub     p1, a0  ; 5
hi6f:;
    sub     p1, a0  ; 4
    sub     p1, XL  ; 3
    sbrc    a0, 0   ; 2
    subi    p1, -128; 1
    done

hi73:;
    add     p1, a0  ; 6
hi72:;
    add     p1, a0  ; 5
hi71:;
    add     p1, a0  ; 4
hi70:;
    sub     p1, XL  ; 3
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done
hi75:; not quite symmetrical
    add     p1, a0  ; 7
hi74:;
    add     a0, a0  ; 6
    add     p1, a0  ; 5
    add     p1, a0  ; 4
    sub     p1, XL  ; 3
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done
hi76:;
    sub     p1, a0  ; 7
hi77:;
    sub     p1, a0  ; 6
hi78:;              ;           120 ~ 15 * 8
    sub     XL, a0  ; 5
    add     XL, XL  ; 4
    add     XL, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi7b:;
    sub     p1, a0  ; 6
hi7c:;
    add     XL, XL  ; 5
    sub     XL, a0  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    add     p1, XL  ; 1
    done
hi79:
    sub     p1, a0  ; 7
hi7a:
    add     a0, a0  ; 6
hi7d:
    sub     p1, a0  ; 5
hi7e:
    sub     p1, a0  ; 4
hi7f:
    sub     p1, a0  ; 3
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done

hi85:
    add     p1, a0  ; 7
hi84:
    add     p1, a0  ; 6
hi83:
    add     p1, a0  ; 5
hi82:
    add     p1, a0  ; 4
hi81:
    add     p1, a0  ; 3
hi80:
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done
hi86:
    sub     p1, a0  ; 7
hi87:; not quite symmetrical    135 ~ 15 * 9
    sub     XL, a0  ; 6
    add     p1, XL  ; 5
    add     XL, XL  ; 4
    add     XL, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi8a:;
    add     p1, a0  ; 7
hi89:;
    add     p1, a0  ; 6
hi88:;
    add     XL, a0  ; 5
    add     XL, XL  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    add     p1, XL  ; 1
    done
hi8b:; not quite symmetrical
    sub     p1, a0  ; 7
hi8c:; not quite symmetrical
    add     p1, XL  ; 6
    add     XL, XL  ; 5
    sub     XL, a0  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    add     p1, XL  ; 1
    done
hi8d:
    sub     p1, a0  ; 6
hi8e:
    sub     p1, a0  ; 5
hi8f:
    sub     p1, a0  ; 4
    add     p1, XL  ; 3
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done

hi93:               ;           147 7*7*3
    add     p1, a0  ; 6
hi92:
    add     p1, a0  ; 5
hi91:
    add     p1, a0  ; 4
hi90:
    add     p1, XL  ; 3
    sbrc    XL, 4   ; 2
    subi    p1, -128; 1
    done
hi95:
    add     p1, a0  ; 7
hi94:; no symmetry
    add     p1, XL  ; 6
    add     XL, XL  ; 5
    add     XL, a0  ; 4
    add     XL, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi96:;              ;           150 ~ 15 * 10   nananananaana
    sub     p1, a0  ; 7
hi97:
    sub     XL, a0  ; 6         151 ~ (256-)15*-7
    sub     p1, XL  ; 5
    add     XL, XL  ; 4
    sub     p1, XL  ; 3
    sub     p1, XL  ; 2
    sub     p1, XL  ; 1
    done
hi98:;
    sub     p1, a0  ; 7
hi99:;              ;           153 ~ 17 * 9
    add     XL, a0  ; 6
    add     p1, XL  ; 5
    add     XL, XL  ; 4
    add     XL, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hi9c:;
    add     p1, a0  ; 7
hi9b:;
    add     p1, a0  ; 6
hi9a:;
    add     XL, a0  ; 5
    add     XL, XL  ; 4
    sub     p1, XL  ; 3
    sub     p1, XL  ; 2
    sub     p1, XL  ; 1
    done
hi9d:;              ;           157
    sub     p1, a0  ; 7
hi9e:;
    sub     p1, a0  ; 6
hi9f:;
    sub     p1, a0  ; 5
    add     XL, XL  ; 4
    sub     p1, XL  ; 3
    sub     p1, XL  ; 2
    sub     p1, XL  ; 1
    done

hia3:;
    add     p1, a0  ; 7
hia2:;
    add     p1, a0  ; 6
hia1:;
    add     p1, a0  ; 5
hia0:;
    add     XL, XL  ; 4
    sub     p1, XL  ; 3
    sub     p1, XL  ; 2
    sub     p1, XL  ; 1
    done
hia4:;
    sub     p1, a0  ; 7
hia5:;              ;           165 ~ 15 * 11
    add     XL, XL  ; 6
    add     XL, a0  ; 5
    add     p1, XL  ; 4
    add     XL, XL  ; 3
    add     p1, XL  ; 2
    add     p1, XL  ; 1
    done
hia7:;
    add     p1, a0  ; 6
hia6:;
    sub     XL, a0  ; 5
    add     XL, XL  ; 4
    sub     p1, XL  ; 3
    add     XL, XL  ; 2
    sub     p1, XL  ; 1
    done
hia9:;
    add     p1, a0  ; 7
hia8:;
    sub     p1, XL  ; 6
    add     XL, a0  ; 5
    add     XL, a0  ; 4
    add     XL, XL  ; 3
    sub     p1, XL  ; 2
    sub     p1, XL  ; 1
    done
; hiaa / hiab / hiac: m1 = 0xaa, 0xab, 0xac.
;  hiac: -16 - 4*17 = -84, which is 172 (0xac) mod 256;
;  hiab and the #else form of hiaa each subtract a0 once more first.
; NOTE(review): the "#if greedy" variant has no "done" of its own and
;  runs on into hiab with a0 and XL already modified; it does not look
;  like it yields 170 * a0 -- confirm before enabling it.
hiaa:;
#if greedy
    add     a0, a0  ; 6
    add     XL, XL  ; 5
    add     p1, XL  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    add     p1, XL  ; 1
#else
    sub     p1, a0  ; 7
#endif
hiab:;
    sub     p1, a0  ; 6
hiac:;
    sub     p1, XL  ; 5         p1 -= 16*a0
    add     XL, a0  ; 4         XL = 17*a0
    add     XL, XL  ; 3
    add     XL, XL  ; 2         XL = 68*a0
    sub     p1, XL  ; 1
    done
hiad:;
    sub     p1, a0  ; 7
hiae:;
    sub     p1, a0  ; 6
hiaf:;
    sub     p1, a0  ; 5
    sub     p1, XL  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    sub     p1, XL  ; 1
    done

hib3:
    add     p1, a0  ; 7
hib2:
    add     p1, a0  ; 6
hib1:
    add     p1, a0  ; 5
hib0:;
    sub     p1, XL  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    sub     p1, XL  ; 1
    done
hib5:
    add     p1, a0  ; 6
hib4:
    sub     p1, XL  ; 5
    sub     XL, a0  ; 4
    add     XL, XL  ; 3
    add     XL, XL  ; 2
    sub     p1, XL  ; 1
    done
; ...
hieb:               ;               ouch
    sub     p1, a0  ; 5
hiec:
    add     a0, a0  ; 4
    add     a0, a0  ; 3
    sub     p1, a0  ; 2
    sub     p1, XL  ; 1
    done
hie9:
    sub     p1, a0  ; 6
hiea:
    add     a0, a0  ; 5
hied:
    sub     p1, a0  ; 4
hiee:
    sub     p1, a0  ; 3
hief:
    sub     p1, a0  ; 2
    sub     p1, XL  ; 1
    done

hif5:
    add     p1, a0  ; 6
hif4:
    add     p1, a0  ; 5
hif3:
    add     p1, a0  ; 4
hif2:
    add     p1, a0  ; 3
hif1:
    add     p1, a0  ; 2
hif0:
    sub     p1, XL  ; 1
    done
hif6:
    sub     p1, a0  ; 6
hif7:
    sub     p1, a0  ; 5
hif8:
    add     a0, a0  ; 4
    add     a0, a0  ; 3
    sub     p1, a0  ; 2
    sub     p1, a0  ; 1
    done

...

; highs: dispatch table for the high multiplier byte, one word per value
;  of m1.  It must start on a 256-word boundary because doM1 forms the
;  jump target from high(highs) and m1 alone.
; PC is rounded _up_ to the next boundary: adding 0xff (rather than
;  0x100) before masking avoids skipping a whole page when PC already
;  sits on a boundary.
.org (PC + 0xff) & 0xffff00
highs:
;   rjmp    hi00
    done;-) to start code with a ret-insn, move this table first
    rjmp    hi01
    rjmp    hi02
    rjmp    hi03
    rjmp    hi04
    rjmp    hi05
    rjmp    hi06
    rjmp    hi07
    rjmp    hi08
    rjmp    hi09
    rjmp    hi0a
    rjmp    hi0b
    rjmp    hi0c
    rjmp    hi0d
    rjmp    hi0e
    rjmp    hi0f

    rjmp    hi10
    rjmp    hi11
    rjmp    hi12
    rjmp    hi13
; ...
    rjmp    hiee
    rjmp    hief

    rjmp    hif0
    rjmp    hif1
    rjmp    hif2
    rjmp    hif3
    rjmp    hif4
    rjmp    hif5
    rjmp    hif6
    rjmp    hif7
    rjmp    hif8
; Entries 0xf9..0xff are not jumps: entering at entry 0xf9+k executes the
;  remaining 7-k subtractions in place, i.e. p1 -= (7-k) * a0, which is
;  (0xf9+k) * a0 mod 256; the final "done" sits one word past the table.
;   rjmp    hif9    ;           jmp + 4 adds + 1 sub
    sub     p1, a0  ; 7
;   rjmp    hifa    ;           jmp + 3 adds + 2 subs -lutin?
    sub     p1, a0  ; 6
;   rjmp    hifb    ;           jmp + 2 adds + 2 subs
    sub     p1, a0  ; 5
;   rjmp    hifc
    sub     p1, a0  ; 4
;   rjmp    hifd
    sub     p1, a0  ; 3
;   rjmp    hife
    sub     p1, a0  ; 2
;   rjmp    hiff
    sub     p1, a0  ; 1
    done

#undef done
#define done    rjmp doM1
#define owing   rjmp settle

; jump: dispatch table for the low multiplier byte, one word per value of
;  m0.  It must start on a 256-word boundary because mpy1616 forms the
;  jump target from high(jump) and m0 alone.
; PC is rounded _up_ to the next boundary: adding 0xff (rather than
;  0x100) before masking avoids skipping a whole page when PC already
;  sits on a boundary.
.org (PC + 0xff) & 0xffff00
jump:
    rjmp    lo00
    done        ; rjmp  lo01
    rjmp    lo02

;(you know the drill)

    rjmp    lofe
;   rjmp    loff
; negP: p1:p0 = -p1:p0 (16-bit two's complement)
.Macro negP
    com     p1
    neg     p0
    sbci    p1, -1
.EndM
; entry 0xff is handled in place: p = -a, and "owing" lets the high-byte
;  stage make up the missing 256 * a
    negP    ; 6
    owing

; 16-bit helper macros for the low-byte snippets, all acting on the
;  partial product p1:p0.  XH:XL is expected to hold a << 4, as prepared
;  by mpy1616.
.Macro add4     ;   p += XH:XL
    add p0, XL
    adc p1, XH
.EndM
.Macro sub4     ;   p -= XH:XL
    sub p0, XL
    sbc p1, XH
.EndM
.Macro set4     ;   p = XH:XL
    movw    p0, XL
.EndM
.Macro pp2Z     ;   ZH:ZL = p (snapshot of the partial product)
    movw    ZL, p0
.EndM
.Macro addZ     ;   p += ZH:ZL
    add p0, ZL
    adc p1, ZH
.EndM
.Macro subZ     ;   p -= ZH:ZL
    sub p0, ZL
    sbc p1, ZH
.EndM
.Macro clrP     ;   p1:p0 = 0
    eor p0, p0  ;   clr Rd assembles to  eor Rd, Rd
    eor p1, p1
.EndM

...

; Low-byte snippets loNN: reached via the "jump" table with ZL = m0 and
;  p1:p0 preloaded with a1:a0; loNN leaves p1:p0 = 0xNN * a (mod 2^16)
;  and continues at doM1 ("done" is rjmp doM1 here).
;  Building blocks: addA / subA (+-a), doubleP (p *= 2), negP (p = -p),
;  add4 (p += 16*a).  Labels fall through into shared tails; the number
;  column is the author's cycle count.
; do not tail merge to keep *-2/-1/0/1...16 fast
; (with 11, 13, 14, 17, 18 & 19 as collateral benefit, really)
lo00:
    clr p0  ; 4
    clr p1  ; 3
    done
lo07:                   ; 7 = (1+1+1)*2 + 1
    addA    ;10
lo05:                   ; 5 = (1+1)*2 + 1
    addA    ; 8
lo03:                   ; 3 = 1*2 + 1
    doubleP ; 6
lo02:                   ; 2 = 1 + 1
    addA    ; 4
;lo01:
    done    ; 2
lo08:                   ; 8 = (1+1)*4
    addA    ; 8
lo04:
    doubleP ; 6
;lo02:
    doubleP ; 4
    done    ; 2
lo0a:                   ; 10 = (1*4 + 1)*2
    doubleP ;10
lo06:                   ; 6 = (1*2 + 1)*2
    doubleP ; 8
    addA    ; 6
    doubleP ; 4
    done    ; 2
lo09:                   ; 9 = 8 + 1
    doubleP ;10
    doubleP ; 8
    doubleP ; 6
    addA    ; 4
    done    ; 2
lo0b:                   ; 11 = (1*4 + 1)*2 + 1
    doubleP ;12 d
    doubleP ;10 a0
    addA    ; 8 d
    doubleP ; 6 d
    addA    ; 4 s0
    done    ; 2
lo0c:                   ; 12 = (1*2 + 1)*4
    doubleP ;10
    addA    ; 8
    doubleP ; 6
    doubleP ; 4
    done    ; 2
lo0d:                   ; 13 = -(1+1+1) + 16
    addA    ;11
lo0e:                   ; 14 = -(1+1) + 16
    addA    ; 9
lo0f:                   ; 15 = -1 + 16
    negP    ; 7
    add4    ; 4
    done    ; 2
lo10:
    set4    ; 1
    done
lo15:
    addA    ;12
lo14:
    addA    ;10
lo13:
    addA    ; 8
lo12:
    addA    ; 6
lo11:
    add4    ; 4
    done
lo16:
    doubleP ;10
    addA    ; 8
    doubleP ; 6
    add4    ; 4
    done
lo17:
    doubleP ;12
    doubleP ;10
    doubleP ; 8
    subA    ; 6
    add4    ; 4
    done
lo18:
    doubleP ;10
    doubleP ; 8
    doubleP ; 6
    add4    ; 4
    done
lo19:
    doubleP ;12
    doubleP ;10
    doubleP ; 8
    addA    ; 6
    add4    ; 4
    done
lo1a:       ; ...++.+.
    doubleP ;12
    doubleP ;10
    addA    ; 8
    doubleP ; 6
    add4    ; 4
    done
lo1b:       ; ..+..-.-
    negP    ;13
    doubleP ;10
    add4    ; 8
    doubleP ; 6
    subA    ; 4
    done
lo1c:       ; ..+..-..
    negP    ;11
    doubleP ; 8
    add4    ; 6
    doubleP ; 4
    done
lo1d:
    subA    ;14
lo1e:
    subA    ;12
lo1f:
    subA    ;10
lo20:
    subA    ; 8
    add4    ; 6
    add4    ; 4
    done

lo25:
    addA    ;14
lo24:
    addA    ;12
lo23:
    addA    ;10
lo22:
    addA    ; 8
lo21:
    add4    ; 6
    add4    ; 4
    done

...

lo6a:
    doubleP ;14
    add4    ;12
    doubleP ;10
    addA    ; 8
    add4    ; 6
    doubleP ; 4
    done    ; 2
lo6b:; .++.++.- .+++.-.-   .++.++.-?+..+.-.-?
;01101011
;XXX wc, faster without preparational p = a
    addA    ;15 set4    ;15 addA 16 ?set4 15
    add4    ;13 subA    ;12 a4      ?d  ?
    pp2Z    ;11 doubleP ;10 d       ?s0 ?
    doubleP ;10 add4    ; 8 a0      ?d  ?
    addZ    ; 8 doubleP ; 6 a4      ?d  ?
    doubleP ; 6 add4    ; 8 d       ?s0 ?
    subA    ; 4 subA    ; 4 s0      ?a4 ?
    done    ; 2
lo6c:       ; .+++.-..
    set4    ;13 a0
    subA    ;12 a0
    doubleP ;10 a4
    add4    ; 8 d
    doubleP ; 6 a4
    add4    ; 4 d
    done    ; 2
lo6d:       ; .+++..--
    set4    ;14
    add4    ;13
    subA    ;11
    pp2Z    ; 9
    doubleP ; 8
    add4    ; 6
    addZ    ; 4
    done    ; 2

; ...

; lo9a: m0 = 0x9a = 154 = ((1 + 16)*4 + 1)*2 + 16
;  (first column: code and cycle count; further columns: alternative
;   sequence noted by the author)
lo9a:       ; +.
    add4    ;14 nP  14
    doubleP ;12 s4  12
    doubleP ;10 Z   10
    addA    ; 8 d   9
    doubleP ; 6 d   7
    add4    ; 4 aZ  5
    done    ; 2
lo9b:; +.+..-.- +.+..-.-    +.+.+.++    .-.-.-.-
    set4    ;14 negP    ;16 add4    ;16 negP15
    doubleP ;13 add4    ;13 doubleP ;14 s4  12
    subA    ;11 add4    ;11 doubleP ;12 Z   10
    pp2Z    ; 9 pp2Z    ; 9 add4    ;10 d   9
    doubleP ; 8 doubleP ; 8 addA    ; 8 d   7
    doubleP ; 6 doubleP ; 6 doubleP ; 6 aZ  5
    addZ    ; 4 addZ    ; 4 addA    ; 4 owi 3
    done    ; 2
lo9c:       ; +.+..-..
    set4    ;13
    doubleP ;12
    subA    ;10
    doubleP ; 8
    add4    ; 6
    doubleP ; 4
    done    ; 2
lo9d:; +..+++.+ .--...-- +.+...--
;10011101
;XXX wc, faster without preparational p = a
    add4    ;15 set4    ;15 negP    ;15
    pp2Z    ;13 doubleP ;14 sub4    ;12
    doubleP ;12 doubleP ;12 sub4    ;10
    addA    ;10 add4    ;10 pp2Z    ; 8
    doubleP ; 8 subA    ; 8 doubleP ; 7
    doubleP ; 6 doubleP ; 6 addZ    ; 5
    addZ    ; 4 subA    ; 4 owing   ; 3
    done

; ...

loaa:       ; +.+.+.+.
    add4    ;13 add4    ;14 d   14
    pp2Z    ;11 doubleP ;12 s4  12
    doubleP ;10 doubleP ;10 Z   10
    doubleP ; 8 add4    ; 8 d   9
    addZ    ; 6 doubleP ; 4 d   7
    doubleP ; 4 done    ; 2 aZ  5
    done    ; 2
loab:; .-.-.-.- +.+..-.-    +.+.+.++    .-.-.-.-
;10101011
;XXX wc, faster without preparational p = a
    negP    ;15 set4    ;15 add4    ;16 negP16
    sub4    ;12 doubleP ;14 doubleP ;14 s4  13
    pp2Z    ;10 subA    ;12 doubleP ;12 d   11
    doubleP ; 9 doubleP ;10 add4    ;10 a4  9
    doubleP ; 7 add4    ; 8 addA    ; 8 d   7
    addZ    ; 5 doubleP ; 6 doubleP ; 6 s0  5
    owing   ; 3 subA    ; 4 addA    ; 4 owi 3
loac:
    add4    ;14
    doubleP ;12
    addA    ;10
    doubleP ; 8
    add4    ; 6
    doubleP ; 4
    done    ; 2
; load: presumably the straight-line sequence for multiplier 0xAD (173),
; matching the bit pattern 10101101 noted below.
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, negP: p = -p, sub4: p -= 16*a, pp2Z: Z = p,
; doubleP: p += p, addZ: p += Z, and owing: adds the outstanding 256*a and
; finishes the routine -- confirm.
; Under that reading p runs -a, -17a (saved to Z), -33a, -66a, -83a, 173a,
; which agrees with the running totals in the second comment column.
load:       ; .-.-..--  .-.-.-.+    .--.++.+    0.9 1.8 0.8 (avg)
; WC    10101101
    negP    ;15 -1      negP    ;16 a4  a0  a0  17
    sub4    ;12 -16-1   sub4    ;13 d   s4  a0
    pp2Z    ;10 -16-1   doubleP ;11 a0  Z   s4
    sub4    ; 9 -32-1   doubleP ; 9 d   d   d
    doubleP ; 7 -64-2   sub4    ; 7 a4  aZ  d
    addZ    ; 5 -80-3   addA    ; 5 d   d   a0
    owing   ; 3 owing   ; 3 a0  a0  s4
; loae: presumably the straight-line sequence for multiplier 0xAE (174).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, negP: p = -p, sub4: p -= 16*a, doubleP: p += p, and
; owing: adds the outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs -a, -17a, -33a, -66a, -82a, 174a.
; The comment-only line in the middle belongs to the commented alternative.
loae:       ; .-.-..-.
    negP    ;14 clrP    ;15
    sub4    ;11 sub4    ;13
    sub4    ; 9 doubleP ;11
            ;   subA    ; 9
    doubleP ; 7
    sub4    ; 5
    owing   ; 3
; loaf: presumably the straight-line sequence for multiplier 0xAF (175),
; matching the bit pattern 10101111 noted below.
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; clrP: p = 0, sub4: p -= 16*a, doubleP: p += p, subA: p -= a, and
; owing: adds the outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs 0, -16a, -32a, -64a, -80a, -81a, 175a.
loaf:       ; .-.-...-      nutritious
;10101111
;XXX wc, faster without preparational p = a
    clrP    ;15 s
    sub4    ;13 d
    doubleP ;11 d
    doubleP ; 9 a4
    sub4    ; 7 d
    subA    ; 5 a4
    owing   ; 3 s0

; lob0: presumably the straight-line sequence for multiplier 0xB0 (176).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; clrP: p = 0, sub4: p -= 16*a, doubleP: p += p, and owing: adds the
; outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs 0, -16a, -32a, -64a, -80a, 176a.
lob0:       ;               don't call me that
    clrP    ;13
    sub4    ;11
    doubleP ; 9
    doubleP ; 7
    sub4    ; 5
    owing   ; 3
; lob1: presumably the straight-line sequence for multiplier 0xB1 (177).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; clrP: p = 0, sub4: p -= 16*a, doubleP: p += p, addA: p += a, and
; owing: adds the outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs 0, -16a, -32a, -64a, -80a, -79a, 177a.
lob1:
;XXX wc, faster without preparational p = a
    clrP    ;15 s
    sub4    ;13 d
    doubleP ;11 d
    doubleP ; 9 a4
    sub4    ; 7 d
    addA    ; 5 a4
    owing   ; 3 a0
; lob2: presumably the straight-line sequence for multiplier 0xB2 (178).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; clrP: p = 0, sub4: p -= 16*a, doubleP: p += p, addA: p += a, and
; owing: adds the outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs 0, -16a, -32a, -31a, -62a, -78a, 178a.
lob2:
;XXX wc, faster without preparational p = a
    clrP    ;15
    sub4    ;13
    doubleP ;11
    addA    ; 9
    doubleP ; 7
    sub4    ; 5
    owing   ; 3
; lob3: presumably the straight-line sequence for multiplier 0xB3 (179).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, sub4: p -= 16*a, doubleP: p += p, subA: p -= a, and
; owing: adds the outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs -15a, -30a, -60a, -76a, -77a, 179a.
lob3:
    sub4    ;13
    doubleP ;11
    doubleP ; 9
    sub4    ; 7
    subA    ; 5
    owing   ; 3
; ...
; lob6: presumably the straight-line sequence for multiplier 0xB6 (182).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, sub4: p -= 16*a, doubleP: p += p, addA: p += a, and
; owing: adds the outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs -15a, -30a, -29a, -58a, -74a, 182a.
lob6:
    sub4    ;13
    doubleP ;11
    addA    ; 9
    doubleP ; 7
    sub4    ; 5
    owing   ; 3
; lob7: presumably the straight-line sequence for multiplier 0xB7 (183).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, sub4: p -= 16*a, pp2Z: Z = p, doubleP: p += p,
; addA: p += a, addZ: p += Z, and owing: adds the outstanding 256*a and
; finishes the routine -- confirm.
; Under that reading p runs -15a (saved to Z), -30a, -29a, -58a, -73a, 183a.
; Only the first column is code; the remaining columns are commented
; alternatives with their own cycle tallies.
lob7:; .-.-+.++ ++.-+..-    .-.-+..-   +.+++..- ++..-..- .-..-..-
    sub4    ;14 add4    ;15 doubleP ;15 a4  16  n   17   n  16  s4
    pp2Z    ;12 pp2Z    ;13 sub4    ;13 d       a4       d  13  z
    doubleP ;11 doubleP ;12 doubleP ;11 d       d        s4 11  d
    addA    ; 9 add4    ;10 doubleP ; 9 a4      a4       d  9   a0
    doubleP ; 7 doubleP ; 8 sub4    ; 7 d       d        d  7   d
    addZ    ; 5 doubleP ; 6 subA    ; 5 a4      d        s0 5   az
    owing   ; 3 subZ    ; 4 owing   ; 3 s0      s0       owi    owi
; lob8: presumably the straight-line sequence for multiplier 0xB8 (184).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, doubleP: p += p, sub4: p -= 16*a, and owing: adds the
; outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs 2a, -14a, -28a, -56a, -72a, 184a.
lob8:
    doubleP ;13
    sub4    ;11
    doubleP ; 9
    doubleP ; 7
    sub4    ; 5
    owing   ; 3
; lob9: presumably the straight-line sequence for multiplier 0xB9 (185).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, sub4: p -= 16*a, pp2Z: Z = p, addA: p += a,
; doubleP: p += p, addZ: p += Z, and owing: adds the outstanding 256*a and
; finishes the routine -- confirm.
; Under that reading p runs -15a (saved to Z), -14a, -28a, -56a, -71a, 185a.
lob9:
    sub4    ;14
    pp2Z    ;12
    addA    ;11
    doubleP ; 9
    doubleP ; 7
    addZ    ; 5
    owing
; loba: presumably the straight-line sequence for multiplier 0xBA (186).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, negP: p = -p, sub4: p -= 16*a, doubleP: p += p,
; subA: p -= a, and owing: adds the outstanding 256*a and finishes the
; routine -- confirm.
; Under that reading p runs -a, -17a, -34a, -35a, -70a, 186a.
loba:       ; .-...--.
    negP    ;14
    sub4    ;11
    doubleP ; 9
    subA    ; 7
    doubleP ; 5
    owing   ; 3
; lobb: presumably the straight-line sequence for multiplier 0xBB (187).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, negP: p = -p, sub4: p -= 16*a, doubleP: p += p,
; subA: p -= a, and owing: adds the outstanding 256*a and finishes the
; routine -- confirm.
; Under that reading p runs -a, -17a, -34a, -68a, -69a, 187a.
lobb:
    negP    ;14
    sub4    ;11
    doubleP ; 9
    doubleP ; 7
    subA    ; 5
    owing   ; 3
; lobc: presumably the straight-line sequence for multiplier 0xBC (188).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, negP: p = -p, sub4: p -= 16*a, doubleP: p += p, and
; owing: adds the outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs -a, -17a, -34a, -68a, 188a.
lobc:
    negP    ;12
    sub4    ; 9
    doubleP ; 7
    doubleP ; 5
    owing   ; 3
; lobd: presumably the straight-line sequence for multiplier 0xBD (189).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, negP: p = -p, sub4: p -= 16*a, doubleP: p += p,
; addA: p += a, and owing: adds the outstanding 256*a and finishes the
; routine -- confirm.
; Under that reading p runs -a, -17a, -34a, -68a, -67a, 189a.
; The second column (continued on the comment-only lines) is a longer
; commented alternative.
lobd:; .-...-.+
    negP    ;14 set4    ;15
    sub4    ;11 doubleP ;14
    doubleP ; 9 subA    ;12
    doubleP ; 7 doubleP ;10
    addA    ; 5 add4    ; 8
    owing   ; 3 doubleP ; 6
            ;   subA    ; 4
            ;   done    ; 2
; lobe: presumably the straight-line sequence for multiplier 0xBE (190).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, negP: p = -p, sub4: p -= 16*a, doubleP: p += p, and
; owing: adds the outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs -a, -17a, -33a, -66a, 190a.
lobe:       ;           Honni soit qui mal y pense !
    negP    ;12
    sub4    ; 9
    sub4    ; 7
    doubleP ; 5
    owing   ; 3
; lobf / loc0 / loc1: three entry points sharing one tail by fall-through;
; presumably the sequences for multipliers 0xBF (191), 0xC0 (192), 0xC1 (193).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, subA: p -= a, sub4: p -= 16*a, and owing: adds the
; outstanding 256*a and finishes the routine -- confirm.
; Under that reading:
;   entering at lobf: 0, -a, -17a, -33a, -49a, -65a, 191a
;   entering at loc0: 0, -16a, -32a, -48a, -64a, 192a
;   entering at loc1: -15a, -31a, -47a, -63a, 193a
; Do not insert anything between these labels: each one relies on running
; into the code that follows it.
lobf:       ;
    subA    ;15 clrP sub4 d d suba owing 13
loc0:       ;               DONT't call me THAT!
    subA    ;13
loc1:
    sub4    ;11
    sub4    ; 9
    sub4    ; 7
    sub4    ; 5
    owing

; ...

; loca: presumably the straight-line sequence for multiplier 0xCA (202).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, add4: p += 16*a, doubleP: p += p, addA: p += a -- confirm.
; Under that reading p runs 17a, 34a, 50a, 100a, 101a, 202a.
loca:
    add4    ;14
    doubleP ;12
    add4    ;10
    doubleP ; 8
    addA    ; 6
    doubleP ; 4
    done    ; 2
; locb: presumably the straight-line sequence for multiplier 0xCB (203),
; matching the bit pattern 11001011 noted below.
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, doubleP: p += p, sub4: p -= 16*a, addA: p += a, and
; owing: adds the outstanding 256*a and finishes the routine -- confirm.
; Under that reading p runs 2a, -14a, -28a, -27a, -54a, -53a, 203a.
; Only the first column is code; the other columns (including the trailing
; comment-only line) are commented alternatives.
locb:; .-..+.++ ..--.-.- ++..++.-
; WC?!? 11001011
    doubleP ;15 n   16   a4 15  s   17  n   15
    sub4    ;13 d   13   z  13  d   16  s4  12
    doubleP ;11 s4  11   d  12  a4  14  z   10
    addA    ; 9 d   9    aZ 10  s0  12  s0  9
    doubleP ; 7 s4  7    d  8   d   10  d   7
    addA    ; 5 s0  5    d  6   d   8   az  5
    owing   ; 3 owi 3    s0 4   a4  6
            ;                   s0  4
; locc: presumably the straight-line sequence for multiplier 0xCC (204).
; NOTE(review): the macros are defined outside this excerpt. The trace assumes
; p = a on entry, add4: p += 16*a, pp2Z: Z = p, doubleP: p += p,
; addZ: p += Z -- confirm.
; Under that reading p runs 17a (saved to Z), 34a, 51a, 102a, 204a.
locc:       ;           what comments?
    add4    ;13 n   14
    pp2Z    ;11 s4  11
    doubleP ;10 d   9
    addZ    ; 8 d   7
    doubleP ; 6 a4  5
    doubleP ; 4 owi 3
    done    ; 2