Question

我搜索并使用了许多方法来测量经过的时间。为此目的有很多问题。例如，this问题非常好，但是当你需要一个准确的时间记录器时，我找不到一个好的方法。为此，我想在这里分享我的方法以便在出现问题时予以纠正。

更新 & 注意：此问题针对的是基准测试中低于1纳秒的时间测量。它与使用clock_gettime(CLOCK_MONOTONIC,&start);完全不同，后者记录的时间在一纳秒以上。

UPDATE：衡量加速的常用方法是重复应该进行基准测试的程序部分。但是，正如评论中所提到的，当研究人员依赖自动向量化时，它可能会显示出不同的优化。

注意它不够精确，无法衡量一次重复的经过时间。在某些情况下，我的结果显示该部分必须重复超过1K或1M才能获得最短的时间。

建议：我不熟悉shell编程（只知道一些基本命令......）但是，有可能测量最小的时间而不在程序内重复。

MY CURRENT SOLUTION 为了防止分支，我使用宏#define REP_CODE(X) X X X... X X重复代码部分，其中X是我想要进行基准测试的代码部分，如下所示：

//numbers
// FMAX1 is parenthesized so that it expands safely inside larger expressions
// (for example `x / FMAX1` or `FMAX1 % n`). MAX1 is not defined in this
// snippet and must be defined before this point.
#define FMAX1 (MAX1*MAX1)
// Number of filter coefficients.
#define COEFF 8
// input has COEFF extra elements because the kernel reads input[i+j]
// with i < FMAX1 and j < COEFF. All three arrays are 32-byte aligned.
int __attribute__(( aligned(32))) input[FMAX1+COEFF];
int __attribute__(( aligned(32))) output[FMAX1];
int __attribute__(( aligned(32))) coeff[COEFF] = {1,2,3,4,5,6,7,8};

// Benchmark driver. The argument handed to REP_CODE is one timed run of the
// filter kernel; per the surrounding text, REP_CODE pastes its argument many
// times so that no loop branch surrounds the measurement.
// NOTE(review): REP_CODE, i, j, ii, t1_rdtsc, t2_rdtsc and ttotal_rdtsc are
// not declared in this snippet - presumably defined elsewhere; confirm.
int main()
{
    REP_CODE(
        // Read the time-stamp counter immediately before the kernel.
        t1_rdtsc=_rdtsc();
        //Code
        // For every output element, accumulate the COEFF-tap weighted sum of
        // the neighbouring inputs. output[i] is not reset in this snippet, so
        // it keeps accumulating across repetitions.
        for(i = 0; i < FMAX1; i++){
            for(j = 0; j < COEFF; j++){//IACA_START
                output[i] += coeff[j] * input[i+j]; 

            }//IACA_END
        }
        // Read the counter again and store the elapsed ticks of this run.
        t2_rdtsc=_rdtsc();
        ttotal_rdtsc[ii++]=t2_rdtsc-t1_rdtsc;
        )
    // The smallest element in `ttotal_rdtsc` is the answer
}

这不会影响优化，但也会受到代码大小的限制，并且在某些情况下编译时间过长。

有任何建议和更正吗？

提前致谢。

Answer 1

如果您对自动矢量化器有疑问并希望限制它，只需在begin_rdtsc之后添加asm("#somthing");即可将do-while循环分隔开。我刚刚检查过，这样做之后你发布的代码被矢量化了，而此前自动矢量化器无法对其进行矢量化。我修改了你的宏，你可以使用它....

do-while

Answer 2

我已经开发了第一个答案并得到了这个解决方案。但是，我仍然想要一个解决方案。因为准确地测量时间并且影响最小是非常重要的。我将此部分放在头文件中并将其包含在主程序文件中。

//Header file header.h
// Timing helpers built on the x86 time-stamp counter. Usage:
//
//     begin_rdtsc
//         /* code to measure */
//     end_rdtsc
//
// The enclosed code runs exactly `count` times; every sample is stored in
// ttotal_rdtsc[], the smallest one is kept in ttbest_rdtsc, and its 0-based
// iteration index is kept in elapsed_rdtsc. end_rdtsc prints both.
#include <stdio.h>
#include <x86intrin.h>

#define count 1000 // number of repetitions
// Default time budget; only used to initialise overal_time below.
#ifndef OVERAL_TIME
#define OVERAL_TIME 999999999
#endif
// elapsed, overal_time and ttime are not used by the macros below; they are
// kept so that code referring to them still compiles.
long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc[count], ttbest_rdtsc = 99999999999999999, elapsed,  elapsed_rdtsc=count, overal_time = OVERAL_TIME, ttime=0;
int ii=0;

// Opens the measurement loop. ii is reset so that the macro pair can be used
// more than once without indexing past the end of ttotal_rdtsc[].
#define begin_rdtsc\
                    ii = 0;\
                    do{\
                        t1_rdtsc=_rdtsc();

// Closes the measurement loop. The pre-increment in the loop condition keeps
// ii within 0..count-1 while samples are written. The scan afterwards picks
// the minimum sample and remembers which iteration produced it.
#define end_rdtsc\
                        t2_rdtsc=_rdtsc();\
                        ttotal_rdtsc[ii]=t2_rdtsc-t1_rdtsc;\
                    }while (++ii<count);\
                    for(ii=0; ii<count; ii++){\
                        if (ttotal_rdtsc[ii]<ttbest_rdtsc){\
                            ttbest_rdtsc = ttotal_rdtsc[ii];\
                            elapsed_rdtsc = ii;}}\
                    printf("\nthe best is %lld in %lldth iteration \n", ttbest_rdtsc, elapsed_rdtsc);

//Main program
#include "header.h"
.
.
.
// Example driver: whatever is placed between begin_rdtsc and end_rdtsc is
// the code whose clock count gets measured and reported.
int main(void)
{
    //before the section
    begin_rdtsc
       //put your code here to measure the clocks.
    end_rdtsc
    return 0;
}

Answer 3

我建议将此方法用于x86微架构。

注意：

NUM_LOOP应该是一个重复次数：通过多次重复执行你的代码来记录最佳时间，从而提高准确性。

ttbest_rdtsc的初始值必须大于可能出现的最差（最长）时间，我建议将它设为尽可能大的值。

我使用（你可能不需要它）OVERAL_TIME作为另一个检查规则，因为我将它用于许多内核，在某些情况下NUM_LOOP非常大而且我没有想要改变它。我计划OVERAL_TIME限制迭代并在特定时间后停止。

更新：整个程序是：

#include <stdio.h>
#include <x86intrin.h>

// Executes the measured code NUM_LOOP times and keeps the smallest time, to
// filter out overheads such as cache misses.
#define NUM_LOOP 100

// Times the code placed between the two _rdtsc() reads and prints the best
// (smallest) tick count observed.
int main(void)
{
    long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc, ttbest_rdtsc = 99999999999999999;
    int do_while = 0;

    do {
        t1_rdtsc = _rdtsc();
        //put your code here
        t2_rdtsc = _rdtsc();
        ttotal_rdtsc = t2_rdtsc - t1_rdtsc;
        //store the smallest time:
        if (ttotal_rdtsc < ttbest_rdtsc)
            ttbest_rdtsc = ttotal_rdtsc;
    } while (++do_while < NUM_LOOP); // pre-increment: exactly NUM_LOOP runs, matching the message below

    printf("\nthe best is %lld in %d repetitions\n", ttbest_rdtsc, NUM_LOOP);
    return 0;
}

我已更改为此并添加到我自己的标题中，然后我可以在我的程序中使用它。

// Timing helpers built on the x86 time-stamp counter. Usage:
//
//     begin_rdtsc
//         /* code to measure */
//     end_rdtsc
//
// The enclosed code is repeated until either the repetition counter runs out
// or the accumulated ticks reach overal_time. The smallest sample is kept in
// ttbest_rdtsc and the 0-based iteration that produced it in elapsed.
#include <stdio.h>
#include <x86intrin.h>

// NUM_LOOP may be defined before including this header; 100 is the fallback.
#ifndef NUM_LOOP
#define NUM_LOOP 100
#endif
#define do_while NUM_LOOP
#define OVERAL_TIME 999999999
long long t1_rdtsc, t2_rdtsc, ttotal_rdtsc, ttbest_rdtsc = 99999999999999999, elapsed, elapsed_rdtsc=do_while, overal_time = OVERAL_TIME, ttime=0;

// Opens the measurement loop and takes the first counter reading.
#define begin_rdtsc\
                    do{\
                        t1_rdtsc=_rdtsc();

// Closes the measurement loop. elapsed_rdtsc counts down once per loop test,
// so (do_while-elapsed_rdtsc) is the number of repetitions actually executed:
// do_while+1 when the counter runs out, fewer when the time budget stops it.
#define end_rdtsc\
                        t2_rdtsc=_rdtsc();\
                        ttotal_rdtsc=t2_rdtsc-t1_rdtsc;\
                        if (ttotal_rdtsc<ttbest_rdtsc){\
                            ttbest_rdtsc = ttotal_rdtsc;\
                            elapsed=(do_while-elapsed_rdtsc);}\
                        ttime+=ttotal_rdtsc;\
                    }while (elapsed_rdtsc-- && (ttime<overal_time));\
                    printf("\nthe best is %lld in %lldth iteration and %lld repetitions\n", ttbest_rdtsc, elapsed, (do_while-elapsed_rdtsc));

如何使用此方法？嗯，这很简单！

int main() { //before the section begin_rdtsc //put your code here to measure the clocks. end_rdtsc return 0 }

要有创意，您可以更改它以衡量程序中的加速等。输出的一个例子是：

the best is 9600 in 384751th iteration and 569179 repetitions

我测试的代码最佳结果为9600个时钟周期，该最佳值记录于第384751次迭代，我的代码总共被重复执行了569179次。

我在GCC和Clang上测试了它们。

如何测量x86下纳秒以下的时间？

3 个答案: