Question

我正在寻找一种算法来乘以两个比下面更好的整数。你有一个好主意吗？（MCU - AT Tiny 84/85或类似 - 此代码运行的地方没有mul / div运算符）

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint16_t res=0;

    while (b) {
        if ( (b & 1) )
            res+=a;
        b>>=1;
        a+=a;
    }

    return res;
}

当使用avr-gcc编译器为AT Tiny 85/84编译时，此算法几乎与avr-gcc生成的算法__mulhi3相同。

avr-gcc算法：

00000106 <__mulhi3>:
 106:   00 24           eor r0, r0
 108:   55 27           eor r21, r21
 10a:   04 c0           rjmp    .+8         ; 0x114 <__mulhi3+0xe>
 10c:   08 0e           add r0, r24
 10e:   59 1f           adc r21, r25
 110:   88 0f           add r24, r24
 112:   99 1f           adc r25, r25
 114:   00 97           sbiw    r24, 0x00   ; 0
 116:   29 f0           breq    .+10        ; 0x122 <__mulhi3+0x1c>
 118:   76 95           lsr r23
 11a:   67 95           ror r22
 11c:   b8 f3           brcs    .-18        ; 0x10c <__mulhi3+0x6>
 11e:   71 05           cpc r23, r1
 120:   b9 f7           brne    .-18        ; 0x110 <__mulhi3+0xa>
 122:   80 2d           mov r24, r0
 124:   95 2f           mov r25, r21
 126:   08 95           ret

umul16_算法：

00000044 <umul16_>:
  44:   20 e0           ldi r18, 0x00   ; 0
  46:   30 e0           ldi r19, 0x00   ; 0
  48:   61 15           cp  r22, r1
  4a:   71 05           cpc r23, r1
  4c:   49 f0           breq    .+18        ; 0x60 <umul16_+0x1c>
  4e:   60 ff           sbrs    r22, 0
  50:   02 c0           rjmp    .+4         ; 0x56 <umul16_+0x12>
  52:   28 0f           add r18, r24
  54:   39 1f           adc r19, r25
  56:   76 95           lsr r23
  58:   67 95           ror r22
  5a:   88 0f           add r24, r24
  5c:   99 1f           adc r25, r25
  5e:   f4 cf           rjmp    .-24        ; 0x48 <umul16_+0x4>
  60:   c9 01           movw    r24, r18
  62:   08 95           ret

<小时/> 修改：指令集可用here。

Answer 1

<强>摘要

考虑交换a和b（原始提案）
试图避免条件跳转（未成功优化）
重新输入输入公式（估计增加35％）
删除重复的班次
展开循环：＆＃34;最佳＆＃34;装配
说服编译器提供最佳装配

1。考虑交换a和b

一个改进是首先比较a和b，并在a<b时交换它们：你应该使用b中较小的一个，这样你就有了最小的周期数。请注意，您可以通过复制代码来避免交换（if (a<b)然后跳转到镜像代码部分），但我怀疑它是否值得。

2。试图避免条件跳转（未成功优化）

尝试：

uint16_t umul16_(uint16_t a, uint16_t b)
{
    ///Here swap if necessary
    uint16_t accum=0;

    while (b) {
        accum += ((b&1) * uint16_t(0xffff)) & a; //Hopefully this multiplication is optimized away
        b>>=1;
        a+=a;
    }

    return accum;
}

根据塞尔吉奥的反馈，这并没有带来改善。

第3。重新输入输入公式

考虑到目标体系结构基本上只有8位指令，如果分开输入变量的上下8位，你可以写：

a = a1 * 0xff + a0;
b = b1 * 0xff + b0;

a * b = a1 * b1 * 0xffff + a0 * b1 * 0xff + a1 * b0 * 0xff + a0 * b0

现在，很酷的是我们可以抛弃术语a1 * b1 * 0xffff，因为0xffff 将其从您的注册表中删除。

(16bit) a * b = a0 * b1 * 0xff + a1 * b0 * 0xff + a0 * b0

此外，a0*b1和a1*b0项可以视为8位乘法，因为0xff：任何超过256的部分都将从寄存器中发送

到目前为止令人兴奋！ ......但是，现实引人注目：a0 * b0必须被视为16位乘法，因为你必须保留所有结果位。 a0必须保持在16位以允许移位左移。这个乘法有a * b次迭代的一半，部分 8bit（因为b0）但你仍然要考虑前面提到的2 8位乘法，最后的结果组成。 我们需要进一步重塑！

现在我收集b0。

(16bit) a * b = a0 * b1 * 0xff + b0 * (a0 + a1 * 0xff)

但是

(a0 + a1 * 0xff) = a

所以我们得到：

(16bit) a * b = a0 * b1 * 0xff + b0 * a

如果N是原始a * b的周期，则现在第一项是8位乘法，具有N / 2个周期，第二项是16位* 8位乘法，具有N / 2个周期。考虑M原始a*b中每次迭代的指令数，8位* 8位迭代具有一半指令，16位* 8位约为M的80％（与b相比，b0的一个移位指令更少）。综合起来，我们有N/2*M/2+N/2*M*0.8 = N*M*0.65复杂度，因此相对于原始N*M预期节省了~35％。声音很有希望。

这是代码：

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint8_t res1 = 0;

    uint8_t a0 = a & 0xff; //This effectively needs to copy the data
    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Here a0 and b1 could be swapped (to have b1 < a0)
    while (b1) {///Maximum 8 cycles
        if ( (b1 & 1) )
            res1+=a0;
        b1>>=1;
        a0+=a0;
    }

    uint16_t res = (uint16_t) res1 * 256; //Should be optimized away, it's not even a copy!

    //Here swapping wouldn't make much sense
    while (b0) {///Maximum 8 cycles
        if ( (b0 & 1) )
            res+=a;
        b0>>=1;
        a+=a;
    }

    return res;
}

此外，理论上，在2个周期内的分裂应该是跳过某些周期的可能性的两倍：N / 2可能略微过高估计。

进一步的微小改进包括避免a变量的最后一次不必要的转变。小旁注：如果b0或b1为零，则会产生2条额外指令。但是它还保存了b0和b1的第一次检查，这是最昂贵的，因为它无法检查for循环条件跳转的移位操作的zero flag状态。

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint8_t res1 = 0;

    uint8_t a0 = a & 0xff; //This effectively needs to copy the data
    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Here a0 and b1 could be swapped (to have b1 < a0)
    if ( (b1 & 1) )
        res1+=a0;
    b1>>=1;
    while (b1) {///Maximum 7 cycles
        a0+=a0;
        if ( (b1 & 1) )
            res1+=a0;
        b1>>=1;
    }

    uint16_t res = (uint16_t) res1 * 256; //Should be optimized away, it's not even a copy!

    //Here swapping wouldn't make much sense
    if ( (b0 & 1) )
        res+=a;
    b0>>=1;
    while (b0) {///Maximum 7 cycles
        a+=a;
        if ( (b0 & 1) )
            res+=a;
        b0>>=1;
    }

    return res;
}

4。删除重复的班次

还有改进空间吗？是，因为a0中的字节移位两次。所以组合两个循环应该有好处。说服编译器完全按照我们的意愿行事可能有点棘手，特别是对于结果寄存器。

因此，我们会在同一周期b0和b1中处理。首先要处理的是循环退出条件？到目前为止，使用b0 / b1已清除状态非常方便，因为它避免使用计数器。此外，在右移之后，如果操作结果为零，则可能已经设置了一个标志，并且该标志可能允许条件跳转而无需进一步评估。

现在循环退出条件可能是(b0 || b1)的失败。然而，这可能需要昂贵的计算。一种解决方案是比较b0和b1并跳转到2个不同的代码部分：如果b1 > b0我在b1上测试条件，否则我在b0上测试条件。我更喜欢另一个解决方案，有2个循环，b0为零时的第一个退出，b1为零时的第二个退出。在某些情况下，我会在b1中进行零迭代。关键是在第二个循环中我知道b0为零，所以我可以减少执行的操作次数。

现在，让我们忘记退出条件并尝试加入上一节的2个循环。

uint16_t umul16_(uint16_t a, uint16_t b)
{
    uint16_t res = 0;

    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Swapping probably doesn't make much sense anymore
    if ( (b1 & 1) )
        res+=(uint16_t)((uint8_t)(a && 0xff))*256;
    //Hopefully the compiler understands it has simply to add the low 8bit register of a to the high 8bit register of res

    if ( (b0 & 1) )
        res+=a;

    b1>>=1;
    b0>>=1;
    while (b0) {///N cycles, maximum 7
        a+=a;
        if ( (b1 & 1) )
            res+=(uint16_t)((uint8_t)(a & 0xff))*256;
        if ( (b0 & 1) )
            res+=a;
        b1>>=1;
        b0>>=1; //I try to put as last the one that will leave the carry flag in the desired state
    }

    uint8_t a0 = a & 0xff; //Again, not a real copy but a register selection

    while (b1) {///P cycles, maximum 7 - N cycles
        a0+=a0;
        if ( (b1 & 1) )
            res+=(uint16_t) a0 * 256;
        b1>>=1;
    }
    return res;
}

感谢Sergio提供生成的程序集（-Ofast）。乍一看，考虑到代码中mov的大量sbrs //Tests a single bit in a register and skips the next instruction if the bit is set. Skip takes 2 clocks. ldi // Load immediate, 1 clock sbiw // Subtracts immediate to *word*, 2 clocks 00000010 <umul16_Antonio5>: 10: 70 ff sbrs r23, 0 12: 39 c0 rjmp .+114 ; 0x86 <__SREG__+0x47> 14: 41 e0 ldi r20, 0x01 ; 1 16: 00 97 sbiw r24, 0x00 ; 0 18: c9 f1 breq .+114 ; 0x8c <__SREG__+0x4d> 1a: 34 2f mov r19, r20 1c: 20 e0 ldi r18, 0x00 ; 0 1e: 60 ff sbrs r22, 0 20: 07 c0 rjmp .+14 ; 0x30 <umul16_Antonio5+0x20> 22: 28 0f add r18, r24 24: 39 1f adc r19, r25 26: 04 c0 rjmp .+8 ; 0x30 <umul16_Antonio5+0x20> 28: e4 2f mov r30, r20 2a: 45 2f mov r20, r21 2c: 2e 2f mov r18, r30 2e: 34 2f mov r19, r20 30: 76 95 lsr r23 32: 66 95 lsr r22 34: b9 f0 breq .+46 ; 0x64 <__SREG__+0x25> 36: 88 0f add r24, r24 38: 99 1f adc r25, r25 3a: 58 2f mov r21, r24 3c: 44 27 eor r20, r20 3e: 42 0f add r20, r18 40: 53 1f adc r21, r19 42: 70 ff sbrs r23, 0 44: 02 c0 rjmp .+4 ; 0x4a <__SREG__+0xb> 46: 24 2f mov r18, r20 48: 35 2f mov r19, r21 4a: 42 2f mov r20, r18 4c: 53 2f mov r21, r19 4e: 48 0f add r20, r24 50: 59 1f adc r21, r25 52: 60 fd sbrc r22, 0 54: e9 cf rjmp .-46 ; 0x28 <umul16_Antonio5+0x18> 56: e2 2f mov r30, r18 58: 43 2f mov r20, r19 5a: e8 cf rjmp .-48 ; 0x2c <umul16_Antonio5+0x1c> 5c: 95 2f mov r25, r21 5e: 24 2f mov r18, r20 60: 39 2f mov r19, r25 62: 76 95 lsr r23 64: 77 23 and r23, r23 66: 61 f0 breq .+24 ; 0x80 <__SREG__+0x41> 68: 88 0f add r24, r24 6a: 48 2f mov r20, r24 6c: 50 e0 ldi r21, 0x00 ; 0 6e: 54 2f mov r21, r20 70: 44 27 eor r20, r20 72: 42 0f add r20, r18 74: 53 1f adc r21, r19 76: 70 fd sbrc r23, 0 78: f1 cf rjmp .-30 ; 0x5c <__SREG__+0x1d> 7a: 42 2f mov r20, r18 7c: 93 2f mov r25, r19 7e: ef cf rjmp .-34 ; 0x5e <__SREG__+0x1f> 80: 82 2f mov r24, r18 82: 93 2f mov r25, r19 84: 08 95 ret 86: 20 e0 ldi r18, 0x00 ; 0 88: 30 e0 ldi r19, 0x00 ; 0 8a: c9 cf rjmp .-110 ; 0x1e <umul16_Antonio5+0xe> 8c: 40 e0 ldi r20, 0x00 ; 0 8e: c5 cf rjmp .-118 ; 0x1a <umul16_Antonio5+0xa>，似乎编译器没有解释，因为我想要他给他解释寄存器的提示。

输入为：r22，r23和r24,25 AVR指令集：Quick reference，Detailed documentation

//Input: a = a1 * 256 + a0, b = b1 * 256 + b0
//Output: r = r1 * 256 + r0

Preliminary:
P0 r0 = 0 (CLR)
P1 r1 = 0 (CLR)

Main block:
0 Shift right b0 (LSR)
1 If carry is not set skip 2 instructions = jump to 4 (BRCC)
2 r0 = r0 + a0 (ADD)
3 r1 = r1 + a1 + carry from prev. (ADC)
4 Shift right b1 (LSR)
5 If carry is not set skip 1 instruction = jump to 7 (BRCC)
6 r1 = r1 + a0 (ADD)
7 a0 = a0 + a0 (ADD)  
8 a1 = a1 + a1 + carry from prev. (ADC)

[Repeat same instructions for another 7 times]

5。展开循环：＆＃34;最佳＆＃34;装配

通过所有这些信息，让我们试着了解什么是最佳的＆＃34;考虑到架构限制的解决方案＆＃34;最优＆＃34;被引用是因为什么是最佳＆＃34;很大程度上取决于输入数据和我们想要优化的内容。 我们假设我们希望优化最坏情况下的周期数。如果我们选择最坏的情况，循环展开是一个合理的选择：我们知道我们有8个循环，我们删除所有测试以了解我们是否完成（如果b0和b1为零）。到目前为止，我们使用了技巧＆＃34;我们转移，我们检查零标志＆＃34;检查我们是否必须退出循环。删除了这个要求，我们可以使用不同的技巧：我们移位，并且我们检查进位位（我们在移位时从寄存器发出的位）以了解我是否应该更新结果即可。给定指令集，在汇编＆＃34;叙述＆＃34;代码说明如下。

8 * 9 + 2 - 2 =

如果没有跳转，分支需要1条指令，否则为2。所有其他指令都是1个周期。因此b1状态对循环次数没有影响，而如果b0 = 1则有9个循环，如果b0 = 0则有8个循环。计算初始化，8次迭代并跳过a0和a1的最后更新，在更坏的情况下（b0 = 11111111b），我们共有void iterate(uint8_t& b0,uint8_t& b1,uint16_t& a, uint16_t& r) { const uint8_t temp0 = b0; b0 >>=1; if (temp0 & 0x01) {//Will this convince him to use the carry flag? r += a; } const uint8_t temp1 = b1; b1 >>=1; if (temp1 & 0x01) { r+=(uint16_t)((uint8_t)(a & 0xff))*256; } a += a; } uint16_t umul16_(uint16_t a, uint16_t b) { uint16_t r = 0; uint8_t b0 = b & 0xff; uint8_t b1 = b >>8; iterate(b0,b1,a,r); iterate(b0,b1,a,r); iterate(b0,b1,a,r); iterate(b0,b1,a,r); iterate(b0,b1,a,r); iterate(b0,b1,a,r); iterate(b0,b1,a,r); iterate(b0,b1,a,r); //Hopefully he understands he doesn't need the last update for variable a return r; } 72个周期。我不知道哪个C ++实现会说服编译器生成它。也许：

Main block: 0 Test Nth bit of b0 (SBRS). If set jump to 2 (+ 1cycle) otherwise continue with 1 1 Jump to 4 (RJMP) 2 r0 = r0 + a0 (ADD) 3 r1 = r1 + a1 + carry from prev. (ADC) 4 Test Nth bit of (SBRC). If cleared jump to 6 (+ 1cycle) otherwise continue with 5 5 r1 = r1 + a0 (ADD) 6 a0 = a0 + a0 (ADD) 7 a1 = a1 + a1 + carry from prev. (ADC)

但是，鉴于之前的结果，要真正获得所需的代码，应该真正切换到汇编！

最后，我们还可以考虑对循环展开进行更极端的解释：sbrc / sbrs指令允许测试寄存器的特定位。因此，我们可以避免移位b0和b1，并且在每个周期检查不同的位。唯一的问题是这些指令只允许跳过下一条指令，而不是自定义跳转。所以，在＆＃34;叙事代码＆＃34;它看起来像这样：

template<uint8_t mask> void iterateWithMask(const uint8_t& b0,const uint8_t& b1, uint16_t& a, uint16_t& r) { if (b0 & mask) r += a; if (b1 & mask) r+=(uint16_t)((uint8_t)(a & 0xff))*256; a += a; } uint16_t umul16_(uint16_t a, const uint16_t b) { uint16_t r = 0; const uint8_t b0 = b & 0xff; const uint8_t b1 = b >>8; iterateWithMask<0x01>(b0,b1,a,r); iterateWithMask<0x02>(b0,b1,a,r); iterateWithMask<0x04>(b0,b1,a,r); iterateWithMask<0x08>(b0,b1,a,r); iterateWithMask<0x10>(b0,b1,a,r); iterateWithMask<0x20>(b0,b1,a,r); iterateWithMask<0x40>(b0,b1,a,r); iterateWithMask<0x80>(b0,b1,a,r); //Hopefully he understands he doesn't need the last update for a return r; }

虽然第二次替换允许保存1个周期，但在第二次替换中没有明显的优势。但是，我相信C ++代码可能更容易为编译器解释。考虑到8个周期，初始化和跳过a0和a1的最后更新，我们现在 64个周期。

C ++代码：

r+=(uint16_t)((uint8_t)(a & 0xff))*256;

请注意，在此实现中，0x01,0x02不是实数值，而只是提示编译器知道要测试哪个位。因此，无法通过向右移动获得掩码：与目前为止看到的所有其他函数不同，这实际上没有等效的循环版本。

一个大问题是

r

它应该只是a的较高寄存器与r+=(uint16_t) 256 *((uint8_t)(a & 0xff));的较低寄存器的总和。不按我的意愿解释。其他选择：

a

<小时/> 的 6。说服编译器提供最佳装配

我们也可以保持r不变，而改为结果b。在这种情况下，我们从最重要的位开始处理r。复杂性是等价的，但编译器可能更容易消化。此外，这次我们必须小心地明确写出最后一个循环，它不能为template<uint8_t mask> void inverseIterateWithMask(const uint8_t& b0,const uint8_t& b1,const uint16_t& a, const uint8_t& a0, uint16_t& r) { if (b0 & mask) r += a; if (b1 & mask) r+=(uint16_t)256*a0; //Hopefully easier to understand for the compiler? r += r; } uint16_t umul16_(const uint16_t a, const uint16_t b) { uint16_t r = 0; const uint8_t b0 = b & 0xff; const uint8_t b1 = b >>8; const uint8_t a0 = a & 0xff; inverseIterateWithMask<0x80>(b0,b1,a,r); inverseIterateWithMask<0x40>(b0,b1,a,r); inverseIterateWithMask<0x20>(b0,b1,a,r); inverseIterateWithMask<0x10>(b0,b1,a,r); inverseIterateWithMask<0x08>(b0,b1,a,r); inverseIterateWithMask<0x04>(b0,b1,a,r); inverseIterateWithMask<0x02>(b0,b1,a,r); //Last iteration: if (b0 & 0x01) r += a; if (b1 & 0x01) r+=(uint16_t)256*a0; return r; }进行进一步的右移。

{{1}}

Answer 2

嗯，LUT和shift的混合通常有效

沿线的东西，乘以8位实体。让我们考虑它们由两个四边形组成

uint4_t u1, l1, u2, l2;
uint8_t a = 16*u1 + l1;
uint8_t b = 16*u2 + l2;

product = 256*u1*u2 + 16*u1*l2 + 16*u2*l1 + l1*l1;

inline uint4_t hi( uint8_t v ) { return v >> 4; }
inline uint4_t lo( uint8_t v ) { return v & 15; }

inline uint8_t LUT( uint4_t x, uint4_t y ) {
    static uint8_t lut[256] = ...;
    return lut[x | y << 4]
}

uint16_t multiply(uint8_t a, uint8_t b) {
    return (uint16_t)LUT(hi(a), hi(b)) << 8 +
           ((uint16_t)LUT(hi(a), lo(b)) + (uint16_t)LUT(lo(a), hi(b)) << 4 +
           (uint16_t)LUT(lo(a), lo(b));
}

只需用乘法结果填充lut []。在您的情况下，取决于内存你可以使用四边形（256个大小的LUT）或字节（65536大小LUT）或其间的任何东西

Answer 3

一种方法是展开循环。我没有为您正在使用的平台编译器，因此我无法查看生成的代码，但这样的方法可能有所帮助。

此代码的性能与数据相关性较低 - 在最坏的情况下，如果不检查是否处于最佳状态，则会更快。代码大小有点大，但不是查找表的大小。

（注意代码未经测试，我的头脑。我很好奇生成的代码是什么样的！）

#define UMUL16_STEP(a, b, shift) \
    if ((b) & (1U << (shift))) result += ((a) << (shift)));

uint16_t umul16(uint16_t a, uint16_t b)
{
    uint16_t result = 0;

    UMUL16_STEP(a, b, 0);
    UMUL16_STEP(a, b, 1);
    UMUL16_STEP(a, b, 2);
    UMUL16_STEP(a, b, 3);
    UMUL16_STEP(a, b, 4);
    UMUL16_STEP(a, b, 5);
    UMUL16_STEP(a, b, 6);
    UMUL16_STEP(a, b, 7);
    UMUL16_STEP(a, b, 8);
    UMUL16_STEP(a, b, 9);
    UMUL16_STEP(a, b, 10);
    UMUL16_STEP(a, b, 11);
    UMUL16_STEP(a, b, 12);
    UMUL16_STEP(a, b, 13);
    UMUL16_STEP(a, b, 14);
    UMUL16_STEP(a, b, 15);

    return result;
}

更新

根据编译器的作用，UMUL16_STEP宏可以更改。另一种选择可能是：

#define UMUL16_STEP(a, b, shift) \
    if ((b) & (1U << (shift))) result += (a); (a) << 1;

使用这种方法，编译器可能能够使用sbrc指令来避免分支。

我猜测汇编器应该如何看待每一位，r0：r1是结果，r2：r3是a而r4：r5是b：

sbrc r4, 0
add r0, r2
sbrc r4, 0
addc r1, r3
lsl r2
rol r3

这应该在没有分支的情况下在恒定时间内执行。测试r4中的位，然后测试r5中的位以获得更高的8位。这应该根据我对指令集手册的读取，在96个周期内执行乘法。

Answer 4

一个非答案的tinyARM汇编程序（web doc）而不是C ++或C.我修改了一个非常通用multiply-by-squares-lookup的速度（＆lt; 50个循环，不包括调用和返回开销），代价是仅适用于具有不小于1KB的RAM的AVR，使用512个对齐的字节用于下半部分的表格。在20 MHz时，这将很好地满足2 max 3 usec时间限制仍未在正确的问题中出现 - 但Sergio Formiggini想要16 MHz。截至2015/04，只有来自Atmel的一个 ATtiny具有那么多RAM，并且指定高达8 MHz ......（滚动你的“拥有”（例如，来自OpenCores）你的FPGA可能有一堆快速乘法器（18×18位似乎很流行），如果不是处理器内核的话。）要快速转移和添加，请查看shift and add, factor shifting left, unrolled 16×16→16和/或对其进行改进（{{3}}）。（您可能会在问题中创建社区维基回答。）

.def    a0  = r16   ; factor low byte
.def    a1  = r17
#warning two warnings about preceding definitions of
#warning  r16 and r17 are due and may as well be ignored
.def    a   = r16   ; 8-bit factor
.def    b   = r17   ; 8-bit factor ; or r18, rather?
.def    b0  = r18   ; factor low byte
.def    b1  = r19
.def    p0  = r20   ; product low byte
.def    p1  = r21

; "squares table" SqTab shall be two 512 Byte tables of
;  squares of 9-bit natural numbers, divided by 4

; Idea: exploit p = a * b = Squares[a+b] - Squares[a-b]

init:
    ldi     r16, 0x73
    ldi     r17, 0xab
    ldi     r18, 23
    ldi     r19, 1
    ldi     r20, HIGH(SRAM_SIZE)
    cpi     r20, 2
    brsh    fillSqTable ; ATtiny 1634?
    rjmp    mpy16T16
fillSqTable:
    ldi     r20, SqTabH
    subi    r20, -2
    ldi     zh, SqTabH
    clr     zl
; generate sqares by adding up odd numbers starting at 1 += -1
    ldi     r22, 1
    clr     r23
    ser     r26
    ser     r27
fillLoop:
    add     r22, r26
    adc     r23, r27
    adiw    r26, 2
    mov     r21, r23
    lsr     r21         ; get bits 9:2
    mov     r21, r22
    ror     r21
    lsr     r21
    bst     r23, 1
    bld     r21, 7
    st      z+, r21
    cp      zh, r20
    brne    fillLoop
    rjmp    mpy16F16

; assembly lines are marked up with cycle count
;  and (latest) start cycle in block.
; If first line in code block, the (latest) block start cycle
;  follows; else if last line, the (max) block cycle total

;**************************************************************
;*
;* "mpy16F16" - 16x16->16 Bit Unsigned Multiplication
;*                        using table lookup
;* Sergio Formiggini special edition
;* Multiplies  two 16-bit register values a1:a0 and b1:b0.
;* The result is placed in p1:p0.
;*
;* Number of flash words: 318 + return = 
;*                       (40 + 256(flash table) + 22(RAM init))
;* Number of cycles     : 49 + return
;* Low  registers used  : None
;* High registers used  : 7+2 (a1:a0, b1:b0, p1:p0, sq;
;*                             + Z(r31:r30))
;* RAM bytes used       : 512 (squares table)
;*
;**************************************************************
mpy16F16:
    ldi     ZH, SqTabH>>1;1 0   0   squares table>>1
    mov     ZL, a0      ; 1 1
    add     ZL, b0      ; 1 2       a0+b0
    rol     ZH          ; 1 3       9 bit offset
    ld      p0, Z       ; 2 4       a0+b0l          1
    lpm     p1, Z       ; 3 6   9   a0+b0h          2

    ldi     ZH, SqTabH  ; 1 0   9   squares table

    mov     ZL, a1      ; 1 0   10
    sub     ZL, b0      ; 1 1       a1-b0
    brcc    noNegF10    ; 1 2
    neg     ZL          ; 1 3
noNegF10:
    ld      sq, Z       ; 2 4       a1-b0l          3
    sub     p1, sq      ; 1 6   7

    mov     ZL, a0      ; 1 0   17
    sub     ZL, b1      ; 1 1       a0-b1
    brcc    noNegF01    ; 1 2
    neg     ZL          ; 1 3
noNegF01:
    ld      sq, Z       ; 2 4       a0-b1l          4
    sub     p1, sq      ; 1 6   7

    mov     ZL, a0      ; 1 0   24
    sub     ZL, b0      ; 1 1       a0-b0
    brcc    noNegF00    ; 1 2
    neg     ZL          ; 1 3
noNegF00:
    ld      sq, Z       ; 2 4       a0-b0l          5
    sub     p0, sq      ; 1 6
    lpm     sq, Z       ; 3 7       a0-b0h          6*
    sbc     p1, sq      ; 1 10  11

    ldi     ZH, SqTabH>>1;1 0   35
    mov     ZL, a1      ; 1 1
    add     ZL, b0      ; 1 2       a1+b0
    rol     ZH          ; 1 3
    ld      sq, Z       ; 2 4       a1+b0l          7
    add     p1, sq      ; 1 6   7

    ldi     ZH, SqTabH>>1;1 0   42
    mov     ZL, a0      ; 1 1
    add     ZL, b1      ; 1 2       a0+b1
    rol     ZH          ; 1 3
    ld      sq, Z       ; 2 4       a0+b1l          8
    add     p1, sq      ; 1 6   7

    ret                 ;       49

.CSEG
.org 256; words?!
SqTableH:
.db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
.db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
.db   0,   0,   0,   0,   0,   0,   0,   0,   0,   0
.db   0,   0,   1,   1,   1,   1,   1,   1,   1,   1
.db   1,   1,   1,   1,   1,   1,   2,   2,   2,   2
.db   2,   2,   2,   2,   2,   2,   3,   3,   3,   3
.db   3,   3,   3,   3,   4,   4,   4,   4,   4,   4
.db   4,   4,   5,   5,   5,   5,   5,   5,   5,   6
.db   6,   6,   6,   6,   6,   7,   7,   7,   7,   7
.db   7,   8,   8,   8,   8,   8,   9,   9,   9,   9
.db   9,   9,  10,  10,  10,  10,  10,  11,  11,  11
.db  11,  12,  12,  12,  12,  12,  13,  13,  13,  13
.db  14,  14,  14,  14,  15,  15,  15,  15,  16,  16
.db  16,  16,  17,  17,  17,  17,  18,  18,  18,  18
.db  19,  19,  19,  19,  20,  20,  20,  21,  21,  21
.db  21,  22,  22,  22,  23,  23,  23,  24,  24,  24
.db  25,  25,  25,  25,  26,  26,  26,  27,  27,  27
.db  28,  28,  28,  29,  29,  29,  30,  30,  30,  31
.db  31,  31,  32,  32,  33,  33,  33,  34,  34,  34
.db  35,  35,  36,  36,  36,  37,  37,  37,  38,  38
.db  39,  39,  39,  40,  40,  41,  41,  41,  42,  42
.db  43,  43,  43,  44,  44,  45,  45,  45,  46,  46
.db  47,  47,  48,  48,  49,  49,  49,  50,  50,  51
.db  51,  52,  52,  53,  53,  53,  54,  54,  55,  55
.db  56,  56,  57,  57,  58,  58,  59,  59,  60,  60
.db  61,  61,  62,  62,  63,  63,  64,  64,  65,  65
.db  66,  66,  67,  67,  68,  68,  69,  69,  70,  70
.db  71,  71,  72,  72,  73,  73,  74,  74,  75,  76
.db  76,  77,  77,  78,  78,  79,  79,  80,  81,  81
.db  82,  82,  83,  83,  84,  84,  85,  86,  86,  87
.db  87,  88,  89,  89,  90,  90,  91,  92,  92,  93
.db  93,  94,  95,  95,  96,  96,  97,  98,  98,  99
.db 100, 100, 101, 101, 102, 103, 103, 104, 105, 105
.db 106, 106, 107, 108, 108, 109, 110, 110, 111, 112
.db 112, 113, 114, 114, 115, 116, 116, 117, 118, 118
.db 119, 120, 121, 121, 122, 123, 123, 124, 125, 125
.db 126, 127, 127, 128, 129, 130, 130, 131, 132, 132
.db 133, 134, 135, 135, 136, 137, 138, 138, 139, 140
.db 141, 141, 142, 143, 144, 144, 145, 146, 147, 147
.db 148, 149, 150, 150, 151, 152, 153, 153, 154, 155
.db 156, 157, 157, 158, 159, 160, 160, 161, 162, 163
.db 164, 164, 165, 166, 167, 168, 169, 169, 170, 171
.db 172, 173, 173, 174, 175, 176, 177, 178, 178, 179
.db 180, 181, 182, 183, 183, 184, 185, 186, 187, 188
.db 189, 189, 190, 191, 192, 193, 194, 195, 196, 196
.db 197, 198, 199, 200, 201, 202, 203, 203, 204, 205
.db 206, 207, 208, 209, 210, 211, 212, 212, 213, 214
.db 215, 216, 217, 218, 219, 220, 221, 222, 223, 224
.db 225, 225, 226, 227, 228, 229, 230, 231, 232, 233
.db 234, 235, 236, 237, 238, 239, 240, 241, 242, 243
.db 244, 245, 246, 247, 248, 249, 250, 251, 252, 253
.db 254, 255
; word addresses, again?!
.equ SqTabH = (high(SqTableH) << 1)

.DSEG
RAMTab .BYTE 512

Answer 5

最后一个答案，如果是一个厚颜无耻的答案：我还没有（还）从GCC获得AVR-C编译器使其适合8K代码。（有关汇编程序的再现，请参阅AVR multiplication: No Holds Barred）这种方法是每个使用Duff设备的人都尝试过第二次尝试：
使用开关。使用宏，源代码看起来完全无害，如果按摩：

#define low(mp)     case mp: p = a0 * (uint8_t)(mp) << 8; break
#define low4(mp)    low(mp); low(mp + 1); low(mp + 2); low(mp + 3)
#define low16(mp)   low4(mp); low4(mp + 4); low4(mp + 8); low4(mp + 12)
#define low64(mp)   low16(mp); low16(mp + 16); low16(mp + 32); low16(mp + 48)
#if preShift
# define CASE(mp)   case mp: return p + a * (mp)
#else
# define CASE(mp)   case mp: return (p0<<8) + a * (mp)
#endif
#define case4(mp)   CASE(mp); CASE(mp + 1); CASE(mp + 2); CASE(mp + 3)
#define case16(mp)  case4(mp); case4(mp + 4); case4(mp + 8); case4(mp + 12)
#define case64(mp)  case16(mp); case16(mp + 16); case16(mp + 32); case16(mp + 48)

extern "C" __attribute__ ((noinline))
 uint16_t mpy16NHB16(uint16_t a, uint16_t b)
{
    uint16_t p = 0;
    uint8_t b0 = (uint8_t)b, b1 = (uint8_t)(b>>8);
    uint8_t a0 = (uint8_t)a, p0;

    switch (b1) {
        case64(0);
        case64(64);
        case64(128);
        case64(192);
    }
#if preShift
    p = p0 << 8;
#endif
#if preliminaries
    if (0 == b0) {
        p = -a;
        if (b & 0x8000)
            p += a << 9;
        if (b & 0x4000)
            p += a << 8;
        return p;
    }
    while (b0 & 1) {
        a <<= 1;
        b0 >>= 1;
    }
#endif
    switch (b0) {
        low64(0);
        low64(64);
        low64(128);
        low64(192);
    }
    return ~0;
}
int main(int ac, char const *const av[])
{
    char buf[22];
    for (uint16_t a = 0 ; a < a+1 ; a++)
      for (uint16_t m = 0 ; m <= a ; m++)
        puts(itoa(//shift4(ac)+shift3MaskAdd((uint16_t)av[0], ac)
    //      +shift4Add(ac, (uint16_t)av[0])
    //           + mpy16NHB16(ac, (ac + 105))
                 mpy16NHB16(a, m), buf, 10));
}

更快的16位乘法算法，适用于8位MCU

5 个答案: