Question

我编写了C代码 - 如下所示 - 以获得某些功能的基准。这个基准测试的主要目的是在AVR AT Tiny85上测试这些功能，但也在PC上测试（现在我使用的是atmega168而不是AT Tiny85 - 但几乎相同）。

此基准测试为其必须测试的每个函数执行大量循环，并为“void”函数执行，该函数接收要测试的函数的相同参数，但仅执行返回。在每个函数的循环结束时，它会写一个标签和一个用usec表示的时间。该时间是标签指定的函数的循环持续时间。

我原本以为，只要从被测函数的基准时间中减去“void”函数的基准时间，再把结果除以循环次数，就能得到被测函数单次调用的耗时。但事实并非如此，因为存在中断（哪怕只是用于计时的中断）。

无论如何，我认为这个基准测试能够指示我最快的功能。你怎么看？你有什么建议吗？

这是一个输出示例：

void 2110168
norm 2121500
base 2337196
basl 2450964
basw 2333980
ant4 2235236
ant5 2242904
unro 2270484
unrl 2590444
vect 2754188
vesw 2732472

标签与函数之间的对应关系，可以查看基准代码中的查找表“static fntest_t fnt”。

我在下面报告的代码可以在PC上使用GCC 64bit（32位进行一些修改，因为警告）或使用Arduino环境/ avr-gcc进行AVR编译。

以下是基准代码。代码中使用的类型test_t在umul.h文件中通过typedef定义为uint16_t（这个typedef的目的是便于更改函数所处理/返回的值的类型，但目前只有其中少数几个函数支持这样更改！）

#ifdef __AVR__
#include <Arduino.h>
#include <HardwareSerial.h>
#include <string.h>
#else
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/time.h>
#include "timefn.h"
#endif

#include "umul.h"

#ifndef UNUSED
#define UNUSED(x) (void)(x)
#endif

/* Signature shared by every routine that can be placed in the benchmark table. */
typedef test_t fn_t(test_t a,test_t b);

/* One benchmark table row: a routine and the label printed next to its timing. */
typedef struct fntest_s {
    fn_t * fn;          /* routine to call */
    const char * msg;   /* label text; const because it points at string literals */
} fntest_t;

test_t nullfn(test_t a,test_t b);
#ifndef __AVR__
uint32_t micros();
#endif

/*
 * Benchmark table: each row pairs a routine with the label that is printed
 * in front of its elapsed time. Rows are run in the order listed here.
 * The first row (nullfn) returns without doing any arithmetic.
 * NOTE(review): every label is four characters except "normal" - confirm
 * whether "norm" was intended.
 */
static fntest_t fnt[]={
    {nullfn,(char *)"void"},
    {umul16_normal,(char *)"normal"},
    {umul16_base,(char *)"base"},
    {umul16_baseandlogic,(char *)"basl"},
    {umul16_baseswap,(char *)"basw"},
    {umul16_Antonio4,(char *)"ant4"},
    {umul16_Antonio5,(char *)"ant5"},
    {umul16_unrolled,(char *)"unro"},
    {umul16_unrolledandlogic,(char *)"unrl"},
    {umul16_vect,(char *)"vect"},
    {umul16_vectswap,(char *)"vesw"}
};

#ifndef __AVR__
/*
 * Non-AVR implementation of micros(): reads the current time with
 * gettimeofday() and returns seconds*1000000 + microseconds, truncated
 * to 32 bits by the return type.
 */
uint32_t micros()
{
    struct timeval now;
    unsigned long usec;

    gettimeofday(&now,NULL);
    usec = now.tv_sec*1000000UL+now.tv_usec;

    return usec;
}
#endif

/*
 * Do-nothing routine with the same signature as the multipliers.
 * Both operands are ignored and 0 is returned.
 */
test_t nullfn(test_t a,test_t b)
{
    (void)a;
    (void)b;

    return 0;
}

/*
 * Runs the benchmark: five passes over the fnt table. For each row it times
 * RUNS calls on rand() operands and prints "<label> <elapsed usec>", then
 * prints a separator line at the end of every pass.
 *
 * The timed interval also contains srand(), the two rand() calls per
 * iteration and the store into z[], so the figures are only meaningful
 * relative to one another.
 *
 * Returns a value folded from all results. z[] exists only so the calls'
 * results are used.
 */
test_t umultry()
{
#ifdef __AVR__
#define RUNS 20000
static char strbuf[50];
#else
#define RUNS 10000000
#endif

    unsigned int i,j,k;
    uint32_t x;

    test_t ix,iy;
    static test_t z[16];

    for(j=0;j<5;j++) {
        for(k=0;k<sizeof(fnt)/sizeof(fnt[0]);k++) {
            x=micros();srand(x);
            for(i=0;i<RUNS;i++) {
                ix=rand();iy=rand();
                z[i&0xF]+=fnt[k].fn(ix,iy);
            }
            x=micros()-x;
            /* x is cast so that one conversion, %lu, is correct on both targets. */
#ifdef __AVR__
            /* Bounded write: a long label can no longer overrun strbuf. */
            snprintf(strbuf,sizeof(strbuf),"%s %lu\n",fnt[k].msg,(unsigned long)x);
            Serial.print(strbuf);
#else
            printf("%s %lu\n",fnt[k].msg,(unsigned long)x);
#endif
        }
        /* Fold slots 1..15 into z[0]; starting at 0 would add z[0] to itself. */
        for(i=1;i<16;i++) {
            z[0]+=z[i];
        }

#ifdef __AVR__
        Serial.println("----------------");
#else
        puts("----------------");
#endif
    }

    return z[0];
}

#ifdef __AVR__
/* AVR build only: calls Serial.begin(115200) and prints the "Starting..." banner. */
void setup()
{
    Serial.begin(115200);
    Serial.println(F("Starting..."));
}

/* AVR build only: runs the benchmark once and then never returns. */
void loop()
{
    umultry();

    while (1) {
        /* spin */
    }
}
#else
/*
 * PC entry point: prints the start banner, runs the benchmark and uses the
 * value it returns as the process exit status.
 */
int main(void)
{
    int status;

    puts("Starting...");
    status = umultry();

    return status;
}
#endif

以下是要测试的功能：

#include "umul.h"

/*
 * Reference multiplication using the compiler's own multiply.
 * Returns a*b truncated to test_t.
 *
 * The product is formed in unsigned arithmetic. Where int is wider than
 * test_t, both operands would otherwise be promoted to signed int, and a
 * product such as 65535*65535 would overflow it (undefined behavior).
 */
test_t umul16_normal(test_t a, test_t b)
{
    return (test_t)(1U*a*b);
}

/*
 * Shift-and-add multiplication with all 16 steps written out.
 * For each bit k that is set in b, a<<k is added to the accumulator.
 * Returns the sum truncated to test_t.
 */
test_t umul16_unrolled(test_t a, test_t b)
{
/* One step: add a<<k when bit k of b is set. */
#define ADD_IF_BIT_SET(k) \
    if (b & (1U << (k))) { acc += (a << (k)); }

    test_t acc = 0;

    ADD_IF_BIT_SET(0);
    ADD_IF_BIT_SET(1);
    ADD_IF_BIT_SET(2);
    ADD_IF_BIT_SET(3);
    ADD_IF_BIT_SET(4);
    ADD_IF_BIT_SET(5);
    ADD_IF_BIT_SET(6);
    ADD_IF_BIT_SET(7);
    ADD_IF_BIT_SET(8);
    ADD_IF_BIT_SET(9);
    ADD_IF_BIT_SET(10);
    ADD_IF_BIT_SET(11);
    ADD_IF_BIT_SET(12);
    ADD_IF_BIT_SET(13);
    ADD_IF_BIT_SET(14);
    ADD_IF_BIT_SET(15);

    return acc;
#undef ADD_IF_BIT_SET
}

/*
 * Branch-free variant of the unrolled shift-and-add multiplication.
 * Each step ANDs a<<k with a mask that is all ones when bit k of b is set
 * and zero otherwise, then adds the outcome unconditionally.
 * Returns the sum truncated to test_t.
 */
test_t umul16_unrolledandlogic(test_t a, test_t b)
{
/* 0 - !!(bit) yields 0 or -1, i.e. an all-zeros or all-ones mask. */
#define ADD_MASKED(k) \
    acc += (a << (k)) & (0 - !!(b & (1U << (k))));

    test_t acc = 0;

    ADD_MASKED(0);
    ADD_MASKED(1);
    ADD_MASKED(2);
    ADD_MASKED(3);
    ADD_MASKED(4);
    ADD_MASKED(5);
    ADD_MASKED(6);
    ADD_MASKED(7);
    ADD_MASKED(8);
    ADD_MASKED(9);
    ADD_MASKED(10);
    ADD_MASKED(11);
    ADD_MASKED(12);
    ADD_MASKED(13);
    ADD_MASKED(14);
    ADD_MASKED(15);

    return acc;
#undef ADD_MASKED
}

/*
 * Shift-and-add multiplication that walks the low byte (b0) and the high
 * byte (b1) of b in parallel.
 *
 * A set bit of b0 contributes the current (doubled) a. A set bit of b1
 * contributes the low byte of the current a, moved into the high byte of the
 * result (times 256). The first loop runs while b0 still has bits; the second
 * finishes any remaining bits of b1 using only the low byte of a.
 *
 * Returns a*b truncated to test_t.
 */
test_t umul16_Antonio5(test_t a, test_t b)
{
    test_t res = 0;

    uint8_t b0 = b & 0xff; //This should be optimized away
    uint8_t b1 = b >>8; //This should be optimized away

    //Swapping probably doesn't make much sense anymore
    if ( (b1 & 1) )
        res+=(test_t)((uint8_t)(a & 0xff))*256; //Bitwise &: take the low byte of a, same as in the loop below
    //Hopefully the compiler understands it has simply to add the low 8bit register of a to the high 8bit register of res

    if ( (b0 & 1) )
        res+=a;

    b1>>=1;
    b0>>=1;
    while (b0) {///N cycles, maximum 7
        a+=a;
        if ( (b1 & 1) )
            res+=(test_t)((uint8_t)(a & 0xff))*256;
        if ( (b0 & 1) )
            res+=a;
        b1>>=1;
        b0>>=1; //I try to put as last the one that will leave the carry flag in the desired state
    }

    uint8_t a0 = a & 0xff; //Again, not a real copy but a register selection

    while (b1) {///P cycles, maximum 7 - N cycles
        a0+=a0;
        if ( (b1 & 1) )
            res+=(test_t) a0 * 256;
        b1>>=1;
    }
    return res;
}

/*
 * Plain shift-and-add multiplication.
 * While b is non-zero: add a when the lowest bit of b is set, then shift b
 * right and double a. Returns the sum truncated to test_t.
 */
test_t umul16_base(test_t a, test_t b)
{
    test_t product = 0;

    for (; b != 0; b >>= 1, a += a) {
        if (b & 1) {
            product += a;
        }
    }

    return product;
}

/*
 * Shift-and-add multiplication without a conditional add.
 * Each round ANDs a with a mask that is all ones when the lowest bit of b is
 * set and zero otherwise, and adds the outcome unconditionally.
 * Returns the sum truncated to test_t.
 */
test_t umul16_baseandlogic(test_t a, test_t b)
{
    test_t product = 0;

    for (; b != 0; b >>= 1, a += a) {
        /* 0 - !!(b & 1) is 0 or -1: selects nothing or all of a */
        product += a & (0 - !!(b & 1));
    }

    return product;
}

/*
 * Shift-and-add multiplication that first swaps the operands when a < b, so
 * that the loop runs over the bits of the smaller one.
 * Returns the sum truncated to test_t.
 */
test_t umul16_baseswap(test_t a, test_t b)
{
    test_t product = 0;

    if (a < b) {
        const test_t smaller = a;

        a = b;
        b = smaller;
    }

    for (; b != 0; b >>= 1, a += a) {
        if (b & 1) {
            product += a;
        }
    }

    return product;
}

/*
 * Multiplication split by the bytes of b.
 * Pass 1 multiplies the low byte of a by the high byte of b in 8-bit
 * arithmetic; that value becomes the high byte of the result.
 * Pass 2 adds a times the low byte of b in test_t arithmetic.
 * Returns the total truncated to test_t.
 */
test_t umul16_Antonio4(test_t a, test_t b)
{
    uint8_t a_lo = a & 0xff;
    uint8_t b_lo = b & 0xff;
    uint8_t b_hi = b >> 8;
    uint8_t hi_acc = 0;
    test_t product;

    /* Pass 1: a_lo * b_hi, kept to 8 bits. a_lo is doubled only while bits remain. */
    for (;;) {
        if (b_hi & 1) {
            hi_acc += a_lo;
        }
        b_hi >>= 1;
        if (!b_hi) {
            break;
        }
        a_lo += a_lo;
    }

    product = (test_t) hi_acc * 256;

    /* Pass 2: a * b_lo, same loop shape on the full-width operand. */
    for (;;) {
        if (b_lo & 1) {
            product += a;
        }
        b_lo >>= 1;
        if (!b_lo) {
            break;
        }
        a += a;
    }

    return product;
}

/*
 * Shift-and-add multiplication that replaces the conditional add with a
 * two-entry lookup: entry 0 is always 0, entry 1 is the current doubled a.
 * The lowest bit of b picks which entry is added each round.
 * Returns the sum truncated to test_t.
 */
test_t umul16_vect(test_t a, test_t b)
{
    test_t addend[2];
    test_t sum = 0;

    addend[0] = 0;
    addend[1] = a;

    for (; b != 0; b >>= 1, addend[1] += addend[1]) {
        sum += addend[b & 1];
    }

    return sum;
}

/*
 * Lookup-based shift-and-add multiplication that first swaps the operands
 * when a < b, so that the loop runs over the bits of the smaller one.
 * Entry 0 of the lookup is always 0, entry 1 is the current doubled a.
 * Returns the sum truncated to test_t.
 */
test_t umul16_vectswap(test_t a, test_t b)
{
    test_t addend[2];
    test_t sum = 0;

    if (a < b) {
        const test_t larger = b;

        b = a;
        a = larger;
    }

    addend[0] = 0;
    addend[1] = a;

    for (; b != 0; b >>= 1, addend[1] += addend[1]) {
        sum += addend[b & 1];
    }

    return sum;
}

/*
 * Bit-by-bit unsigned division of n by d, scanning n from its top bit down.
 *
 * n - dividend
 * d - divisor
 * r - when not NULL, receives the remainder
 *
 * Returns the quotient. When d is 0 it returns (test_t)-1U at once and
 * does not write through r.
 */
test_t udiv_(test_t n,test_t d, test_t *r)
{
    test_t quot = 0;
    test_t rem = 0;
    test_t bit;

    if (d == 0) {
        return (test_t)-1U; /* error */
    }

    /* bit starts at the most significant bit of test_t and moves down */
    for (bit = (test_t)1 << (sizeof(n) * 8 - 1); bit != 0; bit >>= 1) {
        rem <<= 1;
        if (n & bit) {
            rem |= 1;
        }

        if (rem >= d) {
            rem -= d;
            quot |= bit;
        }
    }

    if (r != NULL) {
        *r = rem;
    }

    return quot;
}

以下是要测试的函数的包含文件umul.h：

/* umul.h - operand type and prototypes for the multiplication/division routines. */
#ifndef __UMUL_H
#define __UMUL_H

#ifdef __AVR_ATtiny85__
/* On this target the fixed-width types and NULL are defined here instead of
   coming from <stdint.h> / <stdlib.h>. */
typedef signed int int8_t __attribute__((__mode__(__QI__)));
typedef unsigned int uint8_t __attribute__((__mode__(__QI__)));
typedef signed int int16_t __attribute__ ((__mode__ (__HI__)));
typedef unsigned int uint16_t __attribute__ ((__mode__ (__HI__)));
typedef signed int int32_t __attribute__ ((__mode__ (__SI__)));
typedef unsigned int uint32_t __attribute__ ((__mode__ (__SI__)));
typedef signed int int64_t __attribute__((__mode__(__DI__)));
typedef unsigned int uint64_t __attribute__((__mode__(__DI__)));

#define NULL 0
#else
#include <stdlib.h>
#include <stdint.h>
#endif

/* Operand and result type of every routine declared below. */
typedef uint16_t test_t;

#ifdef __cplusplus
extern "C" {
#endif

test_t umul16_normal(test_t a, test_t b);
test_t umul16_unrolled(test_t a, test_t b);
test_t umul16_unrolledandlogic(test_t a, test_t b);
test_t umul16_Antonio5(test_t a, test_t b);
test_t umul16_base(test_t a, test_t b);
test_t umul16_baseswap(test_t a, test_t b);
test_t umul16_Antonio4(test_t a, test_t b);
test_t umul16_vect(test_t a, test_t b);
test_t umul16_vectswap(test_t a, test_t b);
test_t umul16_baseandlogic(test_t a, test_t b);
test_t udiv_(test_t n,test_t d, test_t *r);

/* The closing brace must be guarded exactly like the opening one above,
   otherwise a C compiler sees a stray '}' and an unbalanced #endif. */
#ifdef __cplusplus
} // extern "C"
#endif

#endif

Answer 1

通常为了消除中断的影响，您只需重复测试几次，并保持最快的响应作为测量。

x86等复杂CPU也需要重复删除对当前缓存内容和分支预测统计信息的依赖。

在现代CPU上，确保时钟频率固定非常重要（大多数现代CPU会自动调节时钟频率，以便在CPU大部分时间空闲时降低发热和功耗，而时钟控制逻辑恢复到全速可能需要一些时间）。

Answer 2

我看到了一些改进基准测试代码的机会。

首先，我会在开头生成随机输入数据，并将其存储在某个缓冲区中。有两个原因：首先，您可以将相同的数据提供给所有算法，其次，您不会使用随机生成例程污染您的计时。

对于srand，您甚至可以对种子编号（例如0）进行硬编码，以使您的数据在连续的基准测试中保持完全相同。

您还应该检查更改测试功能的顺序是否会改变您获得的时间。

顺便说一句，我认为您还应该检查所有功能是否正确，例如如果他们在相同的输入下给出相同的结果。

您设置了哪些优化标志？我会确认一下您的空函数（nullfn）没有被优化掉，或许可以让它执行return a-b;之类的语句。

如果我理解正确，z只是一个占位用的虚拟变量，那么下面这些内容可能根本无关紧要。

我相信你应该为z添加一个初始化，即：

   for(i=0;i<16;i++) {
        z[i] = 0;
    }

下面：

   for(i=0;i<16;i++) {
        z[0]+=z[i]; /* To avoid warn unused and the optimizations don't evaluate z[]*/
    }

应该是：

   for(i=1;i<16;i++) {
        z[0]+=z[i]; /* To avoid warn unused and the optimizations don't evaluate z[]*/
    }

否则，z[0]会在最终结果中被累加两次。

基准测试功能

2 个答案: