Question

我在 ATmega16 上写了一个程序，并且想实现自己的 delay_us()，于是查看了 avr-gcc 编译器库中 _delay_us() 函数的实现，这是它的代码：

static inline void _delay_us(double __us) __attribute__((always_inline));

/*
\ingroup util_delay
Busy-wait for \c __us microseconds by way of _delay_loop_1().

The macro F_CPU must be defined as a constant holding the CPU clock
frequency (in Hertz).

The longest delay this path can produce is 768 us / F_CPU in MHz.

A request above that limit is silently forwarded to _delay_ms();
the caller gets no indication that this happened.
*/
void
_delay_us(double __us)
{
    /* Loop iterations wanted: ticks per us times the delay in us
       (one _delay_loop_1() iteration takes 3 cycles, hence 3e6). */
    double __wanted = ((F_CPU) / 3e6) * __us;

    /* Too long for an 8-bit counter: hand over to the millisecond delay. */
    if (__wanted > 255)
    {
        _delay_ms(__us / 1000.0);
        return;
    }

    /* Anything below one iteration is rounded up to a single iteration. */
    uint8_t __count = (__wanted < 1.0) ? 1 : (uint8_t)__wanted;

    /* Decrements the counter until it reaches 0. */
    _delay_loop_1(__count);
}

我很困惑：如果我使用 1 MHz 时钟，这个包含浮点运算的函数怎么可能产生很短的延迟（如 _delay_us(10)）？因为执行所有这些准备代码所花的时间肯定比这个延迟本身还要长。所以我写了这个程序：

/* F_CPU has to be defined before <util/delay.h> is included, because the
   delay functions in that header read it when they are compiled. */
#define F_CPU 1000000UL

#include <avr/io.h>
#include <avr/interrupt.h>
#include <util/delay.h>

/*
 * Test program: after an initial 1 s wait, drive PORTB high, wait 10 us,
 * then drive it low, so the pulse width can be measured on one of the pins.
 */
int main(void)
{
    _delay_ms(1000);
    DDRB = 0xFF;    /* all eight direction bits set (pins used as outputs) */
    PORTB = 0xFF;   /* start of the pulse */
    _delay_us(10);
    PORTB = 0;      /* end of the pulse */

    for (;;) {}     /* stay here forever */
    return 0;
}

我使用 Proteus 对它进行仿真，并把 PORTB 的一个引脚接到示波器的输入上。然后我看到延迟恰好是 10 us。考虑到准备代码中有下面这条使用浮点运算的语句，延迟怎么可能这么准确：

    double __tmp = ((F_CPU) / 4e3) * __ms;

这应该会花费很多个周期，使 _delay_us(10) 的实际延迟超过 10 us，但测得的时间恰好是 10 us！！

Answer 1

所有浮点运算都由预处理器计算，因为这些延迟函数实际上是宏。所以在mcu执行代码的时候，剩下的就是一个使用整数来做延迟的循环。

Answer 2

typedef unsigned char uint8_t;
#define F_CPU 16000000

extern void _delay_loop_1( uint8_t );
/* Declared explicitly: calling _delay_ms() with no declaration in scope is an
   implicit function declaration, which C99 and later do not permit. */
extern void _delay_ms( double );

/*
 * Delay for __us microseconds.
 *
 * The request is converted to a number of _delay_loop_1() iterations using
 * F_CPU / 3e6 iterations per microsecond.  A result below 1 is rounded up to
 * one iteration; a result above 255 does not fit the 8-bit counter and is
 * passed on to _delay_ms() instead.
 */
static void _delay_us(double __us)
{
    uint8_t __ticks;

    /* Iterations per microsecond times the requested microseconds. */
    double __tmp = ((F_CPU) / 3e6) * __us;

    if (__tmp > 255)
    {
        /* Too long for the 8-bit loop counter. */
        _delay_ms(__us / 1000.0);
        return;
    }

    if (__tmp < 1.0)
    {
        __ticks = 1;
    }
    else
    {
        __ticks = (uint8_t)__tmp;
    }

    _delay_loop_1(__ticks);
}


/* Request a delay of 10 us; the argument is a literal constant. */
void fun1(void)
{
    _delay_us(10);
}

用gcc可以产生这个：

00000000 <fun1>:
   0:   85 e3           ldi r24, 0x35   ; 53
   2:   00 c0           rjmp    .+0         ; 0x4 <__zero_reg__+0x3>

传给 _delay_loop_1 的数值是在编译时而非运行时计算出来的，所有死代码都被消除了。

但添加：

/* Call _delay_us() once for each value from 1 to 9, passing the loop
   counter itself as the delay. */
void fun2(void)
{
    uint8_t us = 1;

    while (us < 10)
    {
        _delay_us(us);
        us++;
    }
}

事情发生了巨大变化。

00000000 <fun1>:
   0:   85 e3           ldi r24, 0x35   ; 53
   2:   00 c0           rjmp    .+0         ; 0x4 <fun2>

00000004 <fun2>:
   4:   8f 92           push    r8
   6:   9f 92           push    r9
   8:   af 92           push    r10
   a:   bf 92           push    r11
   c:   cf 92           push    r12
   e:   df 92           push    r13
  10:   ef 92           push    r14
  12:   ff 92           push    r15
  14:   cf 93           push    r28
  16:   c1 e0           ldi r28, 0x01   ; 1
  18:   6c 2f           mov r22, r28
  1a:   70 e0           ldi r23, 0x00   ; 0
  1c:   80 e0           ldi r24, 0x00   ; 0
  1e:   90 e0           ldi r25, 0x00   ; 0
  20:   00 d0           rcall   .+0         ; 0x22 <fun2+0x1e>
  22:   86 2e           mov r8, r22
  24:   97 2e           mov r9, r23
  26:   a8 2e           mov r10, r24
  28:   b9 2e           mov r11, r25
  2a:   2b ea           ldi r18, 0xAB   ; 171
  2c:   3a ea           ldi r19, 0xAA   ; 170
  2e:   4a ea           ldi r20, 0xAA   ; 170
  30:   50 e4           ldi r21, 0x40   ; 64
  32:   00 d0           rcall   .+0         ; 0x34 <fun2+0x30>
  34:   c6 2e           mov r12, r22
  36:   d7 2e           mov r13, r23
  38:   e8 2e           mov r14, r24
  3a:   f9 2e           mov r15, r25
  3c:   20 e0           ldi r18, 0x00   ; 0
  3e:   30 e0           ldi r19, 0x00   ; 0
  40:   40 e8           ldi r20, 0x80   ; 128
  42:   5f e3           ldi r21, 0x3F   ; 63
  44:   00 d0           rcall   .+0         ; 0x46 <fun2+0x42>
  46:   87 fd           sbrc    r24, 7
  48:   00 c0           rjmp    .+0         ; 0x4a <fun2+0x46>
  4a:   20 e0           ldi r18, 0x00   ; 0
  4c:   30 e0           ldi r19, 0x00   ; 0
  4e:   4f e7           ldi r20, 0x7F   ; 127
  50:   53 e4           ldi r21, 0x43   ; 67
  52:   9f 2d           mov r25, r15
  54:   8e 2d           mov r24, r14
  56:   7d 2d           mov r23, r13
  58:   6c 2d           mov r22, r12
  5a:   00 d0           rcall   .+0         ; 0x5c <fun2+0x58>
  5c:   18 16           cp  r1, r24
  5e:   04 f0           brlt    .+0         ; 0x60 <fun2+0x5c>
  60:   9f 2d           mov r25, r15
  62:   8e 2d           mov r24, r14
  64:   7d 2d           mov r23, r13
  66:   6c 2d           mov r22, r12
  68:   00 d0           rcall   .+0         ; 0x6a <fun2+0x66>
  6a:   86 2f           mov r24, r22
  6c:   00 d0           rcall   .+0         ; 0x6e <fun2+0x6a>
  6e:   cf 5f           subi    r28, 0xFF   ; 255
  70:   ca 30           cpi r28, 0x0A   ; 10
  72:   01 f4           brne    .+0         ; 0x74 <fun2+0x70>
  74:   cf 91           pop r28
  76:   ff 90           pop r15
  78:   ef 90           pop r14
  7a:   df 90           pop r13
  7c:   cf 90           pop r12
  7e:   bf 90           pop r11
  80:   af 90           pop r10
  82:   9f 90           pop r9
  84:   8f 90           pop r8
  86:   08 95           ret
  88:   20 e0           ldi r18, 0x00   ; 0
  8a:   30 e0           ldi r19, 0x00   ; 0
  8c:   4a e7           ldi r20, 0x7A   ; 122
  8e:   54 e4           ldi r21, 0x44   ; 68
  90:   9b 2d           mov r25, r11
  92:   8a 2d           mov r24, r10
  94:   79 2d           mov r23, r9
  96:   68 2d           mov r22, r8
  98:   00 d0           rcall   .+0         ; 0x9a <fun2+0x96>
  9a:   00 d0           rcall   .+0         ; 0x9c <fun2+0x98>
  9c:   00 c0           rjmp    .+0         ; 0x9e <fun2+0x9a>
  9e:   81 e0           ldi r24, 0x01   ; 1
  a0:   00 c0           rjmp    .+0         ; 0xa2 <__SREG__+0x63>

嗯，优化器有多好？

/* Call _delay_us() for the values 20 and 21, passing the loop counter
   itself as the delay. */
void fun3(void)
{
    uint8_t us = 20;

    while (us < 22)
    {
        _delay_us(us);
        us++;
    }
}

果然如此。

00000004 <fun3>:
   4:   8a e6           ldi r24, 0x6A   ; 106
   6:   00 d0           rcall   .+0         ; 0x8 <fun3+0x4>
   8:   80 e7           ldi r24, 0x70   ; 112
   a:   00 c0           rjmp    .+0         ; 0xc <fun3+0x8>

认为计数到10就可以了。

很多时候你会看到这类延迟循环函数是配合硬编码的数值使用的，因为无论你是在做位拆裂（bit banging）还是别的什么，规格书里给出的就是这些数值；当规格书说先触发复位再等待 100 us 时就很简单，你只需用 100 去调用延迟函数。现在，如果在一个文件中有：

fun4(10);

而在另一个文件（另一个优化域）中，除了上面的内容之外还有：

/* Forward the caller-supplied value x to _delay_us(). */
void fun4(uint8_t x)
{
    _delay_us(x);
}

那你就能看出结果会怎样了……运行时计算……甚至不用编译就知道它会有问题。现在像 llvm 这样的一些编译器可以跨文件域进行优化，但它们并不支持 AVR；它们的 MSP430 支持只是宣传噱头而非现实，因为它无法工作，也没有人维护。它们的 ARM 支持显然很好，但几乎每次发布都会更改命令行选项，我早就厌倦了尝试使用它们，因为我必须不断修改 makefile 才能跟上；而且很遗憾，它们优化出的代码不如 gcc 的快，尽管 gcc 生成的代码每次发布都在变差，而 llvm 在变好（当然，变差还是变好见仁见智）。

_delay_us（）;时间解释

2 个答案: