Question

我修改了a previous experiment（Agner Fog＆＃39; s Optimizing Assembly, example 12.10a）中的代码，使其更加依赖：

_id

现在每次迭代需要大约13个周期，但我不知道为什么这么多。请帮我理解。

（更新） 对不起是的，确定@Peter Cordes是对的 - 实际上每次迭代需要9个周期。误会是由我自己引起的。我错过了两个相似的代码片段（指令交换），13个周期的代码在这里：

movsd xmm2, [x] 
movsd xmm1, [one] 
xorps xmm0, xmm0  
mov eax, coeff    

L1:
    movsd xmm3, [eax]
    mulsd xmm3, xmm1
    mulsd xmm1, xmm2
    addsd xmm1, xmm3
    add   eax, 8
    cmp eax, coeff_end
    jb L1

Answer 1

对于我来说，它在Core2 E6600上以每9c的一次迭代运行，这是预期的：

      movsd xmm3, [eax] ; independent, depends only on eax

A:    mulsd xmm3, xmm1  ; 5c: depends on xmm1:C from last iteration
B:    mulsd xmm1, xmm2  ; 5c: depends on xmm1:C from last iteration
C:    addsd xmm1, xmm3  ; 3c: depends on xmm1:B from THIS iteration (and xmm3:A from this iteration)

当xmm1：C从迭代i准备就绪时，下一次迭代可以开始计算：

A：在5c中生成xmm3：A
B：在5c中产生xmm1：B（但是存在资源冲突;这些乘法不能同时在Core2或IvyBridge中开始，只有Haswell及以后）

无论哪个先运行，都必须在C运行之前完成。所以循环携带的依赖链是5 + 3个周期，+ 1c表示资源冲突，它会阻止从同一个周期开始的两个乘法。

以预期速度运行的测试代码：

当数组为8B * 128 * 1024时，这会减慢到每~11c一次迭代。如果您使用更大的数组进行测试而不是使用围绕您发布的重复循环进行测试，那么这就是您的原因看到更高的延迟。

如果负载迟到，CPU就无法“赶上”，因为它会延迟循环携带的依赖链。如果仅在从循环链中分叉的依赖链中需要负载，那么管道可以更容易地吸收偶尔的慢负载。因此，某些循环可能比其他循环对内存延迟更敏感。

        default REL
%macro  IACA_start 0
     mov ebx, 111
     db 0x64, 0x67, 0x90
%endmacro
%macro  IACA_end 0
     mov ebx, 222
     db 0x64, 0x67, 0x90
%endmacro

global _start
_start:
        movsd   xmm2, [x]
        movsd   xmm1, [one]
        xorps   xmm0, xmm0
        mov     ecx, 10000

outer_loop:
        mov     eax, coeff
IACA_start                      ; outside the loop
ALIGN 32                        ; this matters on Core2, .78 insn per cycle vs. 0.63 without
L1:
        movsd   xmm3, [eax]
        mulsd   xmm3, xmm1
        mulsd   xmm1, xmm2
        addsd   xmm1, xmm3
        add     eax, 8
        cmp     eax, coeff_end
        jb      L1
IACA_end
        dec     ecx
        jnz     outer_loop

        ;mov    eax, 1
        ;int    0x80            ; exit() for 32bit code
        xor     edi, edi
        mov     eax, 231        ;  exit_group(0).  __NR_exit = 60.
        syscall


        section .data
x:
one:    dq 1.0

        section .bss
coeff:  resq 24*1024        ; 6 * L1 size.  Doesn't run any faster when it fits in L1 (resb)
coeff_end:

实验测试

$ asm-link interiteration-test.asm
+ yasm -felf64 -Worphan-labels -gdwarf2 interiteration-test.asm
+ ld -o interiteration-test interiteration-test.o

$ perf stat ./interiteration-test

 Performance counter stats for './interiteration-test':
    928.543744      task-clock (msec)         #    0.995 CPUs utilized          
           152      context-switches          #    0.164 K/sec                  
             1      cpu-migrations            #    0.001 K/sec                  
            52      page-faults               #    0.056 K/sec                  
 2,222,536,634      cycles                    #    2.394 GHz                      (50.14%)
 <not supported>      stalled-cycles-frontend  
 <not supported>      stalled-cycles-backend   
 1,723,575,954      instructions              #    0.78  insns per cycle          (75.06%)
   246,414,304      branches                  #  265.377 M/sec                    (75.16%)
        51,483      branch-misses             #    0.02% of all branches          (74.74%)

   0.933372495 seconds time elapsed

每个分支/每7个指令是内循环的一次迭代。

$ bc -l
bc 1.06.95
1723575954 / 7
246225136.28571428571428571428
# ~= number of branches: good
2222536634 / .
9.026
# cycles per iteration

IACA同意：IvB每次迭代9c

（不计算nop中的ALIGN）：

$ iaca.sh -arch IVB interiteration-test
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - interiteration-test
Binary Format - 64Bit
Architecture  - IVB
Analysis Type - Throughput

Throughput Analysis Report
--------------------------
Block Throughput: 9.00 Cycles       Throughput Bottleneck: InterIteration

Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
|  Port  |  0   -  DV  |  1   |  2   -  D   |  3   -  D   |  4   |  5   |
-------------------------------------------------------------------------
| Cycles | 2.0    0.0  | 1.0  | 0.5    0.5  | 0.5    0.5  | 0.0  | 2.0  |
-------------------------------------------------------------------------

N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is expected
! - instruction not supported, was not accounted in Analysis

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           |     | 0.5   0.5 | 0.5   0.5 |     |     |    | movsd xmm3, qword ptr [eax]
|   1    | 1.0       |     |           |           |     |     |    | mulsd xmm3, xmm1
|   1    | 1.0       |     |           |           |     |     | CP | mulsd xmm1, xmm2
|   1    |           | 1.0 |           |           |     |     | CP | addsd xmm1, xmm3
|   1    |           |     |           |           |     | 1.0 |    | add eax, 0x8
|   1    |           |     |           |           |     | 1.0 |    | cmp eax, 0x63011c
|   0F   |           |     |           |           |     |     |    | jb 0xffffffffffffffe7
Total Num Of Uops: 6

Answer 2

根据我上面评论中建议的addsd更改， - ＆gt; addsd xmm0,xmm3，此可以编码为使用全宽度寄存器，性能提升两倍。

松散：

对于ones的初始值，它必须是：

double ones[2] = { 1.0, x }

我们需要将x替换为x2：

double x2[2] = { x * x, x * x }

如果存在奇数个系数，则用零填充它以产生偶数个系数。

并且，将指针增量更改为16。

以下是我得到的测试结果。我进行了多次试验，并采用了最佳时间并通过100次迭代延长时间。 std是C版本，dbl是你的版本，qed是“宽”版本：

R=1463870188
C=100
T=100
I=100
x: 3.467957099973322e+00 3.467957099973322e+00
one: 1.000000000000000e+00 3.467957099973322e+00
x2: 1.202672644725538e+01 1.202672644725538e+01
std: 2.803772098439484e+56 (ELAP: 0.000019312)
dbl: 2.803772098439484e+56 (ELAP: 0.000019312)
qed: 2.803772098439492e+56 (ELAP: 0.000009060)
rtn_loop: 2.179378907910304e+55 2.585834207648461e+56
rtn_shuf: 2.585834207648461e+56 2.179378907910304e+55
rtn_add: 2.803772098439492e+56 2.585834207648461e+56

这是在i7 920 @ 2.67 GHz上完成的。

我认为如果你把经过的数字转换成它们，你会发现你的版本比你想象的要快。

我提前道歉，因为我很难让汇编程序以其他方式工作，因此切换到AT＆amp; T语法。再次，抱歉。另外，我正在使用linux，所以我使用rdi rsi寄存器来传递系数指针。如果你在Windows上，ABI是不同的，你将不得不调整它。

我做了C版本并将其拆开。它几乎与你的代码相同，只是它重新排列了非xmm指令，我在下面添加了。

我相信我发布了所有文件，所以如果你愿意，你可以想象在你的系统上运行它。

这是原始代码：

# xmmloop/dbl.s -- implement using single double

    .globl  dbl
# dbl -- compute result using single double
#
# arguments:
#   rdi -- pointer to coeff vector
#   rsi -- pointer to coeff vector end
dbl:
    movsd   x(%rip),%xmm2           # get x value
    movsd   one(%rip),%xmm1         # get ones
    xorps   %xmm0,%xmm0             # sum = 0

dbl_loop:
    movsd   (%rdi),%xmm3            # c[i]

    add     $8,%rdi                 # increment to next vector element
    cmp     %rsi,%rdi               # done yet?

    mulsd   %xmm1,%xmm3             # c[i]*x^i
    mulsd   %xmm2,%xmm1             # x^(i+1)
    addsd   %xmm3,%xmm0             # sum += c[i]*x^i

    jb      dbl_loop                # no, loop

    retq

以下代码已更改为使用movapd等。人：

# xmmloop/qed.s -- implement using single double

    .globl  qed
# qed -- compute result using single double
#
# arguments:
#   rdi -- pointer to coeff vector
#   rsi -- pointer to coeff vector end
qed:
    movapd  x2(%rip),%xmm2          # get x^2 value
    movapd  one(%rip),%xmm1         # get [1,x]
    xorpd   %xmm4,%xmm4             # sum = 0

qed_loop:
    movapd  (%rdi),%xmm3            # c[i]

    add     $16,%rdi                # increment to next coefficient
    cmp     %rsi,%rdi               # done yet?

    mulpd   %xmm1,%xmm3             # c[i]*x^i
    mulpd   %xmm2,%xmm1             # x^(i+2)
    addpd   %xmm3,%xmm4             # sum += c[i]*x^i

    jb      qed_loop                # no, loop

    movapd  %xmm4,rtn_loop(%rip)    # save intermediate DEBUG
    movapd  %xmm4,%xmm0             # get lower sum
    shufpd  $1,%xmm4,%xmm4          # get upper value into lower half
    movapd  %xmm4,rtn_shuf(%rip)    # save intermediate DEBUG
    addsd   %xmm4,%xmm0             # add upper sum to lower
    movapd  %xmm0,rtn_add(%rip)     # save intermediate DEBUG

    retq

这是代码的C版本：

// xmmloop/std -- compute result using C code

#include <xmmloop.h>

// std -- compute result using C
double
std(const double *cur,const double *ep)
{
    double xt;
    double xn;
    double ci;
    double sum;

    xt = x[0];
    xn = one[0];
    sum = 0;

    for (;  cur < ep;  ++cur) {
        ci = *cur;                  // get c[i]
        ci *= xn;                   // c[i]*x^i
        xn *= xt;                   // x^(i+1)
        sum += ci;                  // sum += c[i]*x^i
    }

    return sum;
}

这是我使用的测试程序：

// xmmloop/xmmloop -- test program

#define _XMMLOOP_GLO_
#include <xmmloop.h>

// tvget -- get high precision time
double
tvget(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_REALTIME,&ts);

    sec = ts.tv_nsec;
    sec /= 1e9;
    sec += ts.tv_sec;

    return sec;
}

// timeit -- get best time
void
timeit(fnc_p proc,double *cofptr,double *cofend,const char *tag)
{
    double tvbest;
    double tvbeg;
    double tvdif;
    double sum;

    sum = 0;

    tvbest = 1e9;

    for (int trycnt = 1;  trycnt <= opt_T;  ++trycnt) {
        tvbeg = tvget();

        for (int iter = 1;  iter <= opt_I;  ++iter)
            sum = proc(cofptr,cofend);

        tvdif = tvget();
        tvdif -= tvbeg;

        if (tvdif < tvbest)
            tvbest = tvdif;
    }

    printf("%s: %.15e (ELAP: %.9f)\n",tag,sum,tvbest);
}

// main -- main program
int
main(int argc,char **argv)
{
    char *cp;
    double *cofptr;
    double *cofend;
    double *cur;
    double val;
    long rseed;
    int cnt;

    --argc;
    ++argv;

    rseed = 0;
    cnt = 0;

    for (;  argc > 0;  --argc, ++argv) {
        cp = *argv;
        if (*cp != '-')
            break;

        switch (cp[1]) {
        case 'C':
            cp += 2;
            cnt = strtol(cp,&cp,10);
            break;

        case 'R':
            cp += 2;
            rseed = strtol(cp,&cp,10);
            break;

        case 'T':
            cp += 2;
            opt_T = (*cp != 0) ? strtol(cp,&cp,10) : 1;
            break;

        case 'I':
            cp += 2;
            opt_I = (*cp != 0) ? strtol(cp,&cp,10) : 1;
            break;
        }
    }

    if (rseed == 0)
        rseed = time(NULL);
    srand48(rseed);
    printf("R=%ld\n",rseed);

    if (cnt == 0)
        cnt = 100;
    if (cnt & 1)
        ++cnt;
    printf("C=%d\n",cnt);

    if (opt_T == 0)
        opt_T = 100;
    printf("T=%d\n",opt_T);

    if (opt_I == 0)
        opt_I = 100;
    printf("I=%d\n",opt_I);

    cofptr = malloc(sizeof(double) * cnt);
    cofend = &cofptr[cnt];

    val = drand48();
    for (;  val < 3;  val += 1.0);

    x[0] = val;
    x[1] = val;
    DMP(x);

    one[0] = 1.0;
    one[1] = val;
    DMP(one);

    val *= val;
    x2[0] = val;
    x2[1] = val;
    DMP(x2);

    for (cur = cofptr;  cur < cofend;  ++cur) {
        val = drand48();
        val *= 1e3;
        *cur = val;
    }

    timeit(std,cofptr,cofend,"std");
    timeit(dbl,cofptr,cofend,"dbl");
    timeit(qed,cofptr,cofend,"qed");
    DMP(rtn_loop);
    DMP(rtn_shuf);
    DMP(rtn_add);

    return 0;
}

头文件：

// xmmloop/xmmloop.h -- common control

#ifndef _xmmloop_xmmloop_h_
#define _xmmloop_xmmloop_h_

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#ifdef _XMMLOOP_GLO_
#define EXTRN_XMMLOOP       /**/
#else
#define EXTRN_XMMLOOP       extern
#endif

#define XMMALIGN            __attribute__((aligned(16)))

EXTRN_XMMLOOP int opt_T;
EXTRN_XMMLOOP int opt_I;

EXTRN_XMMLOOP double x[2] XMMALIGN;
EXTRN_XMMLOOP double x2[2] XMMALIGN;
EXTRN_XMMLOOP double one[2] XMMALIGN;

EXTRN_XMMLOOP double rtn_loop[2] XMMALIGN;
EXTRN_XMMLOOP double rtn_shuf[2] XMMALIGN;
EXTRN_XMMLOOP double rtn_add[2] XMMALIGN;

#define DMP(_sym) \
    printf(#_sym ": %.15e %.15e\n",_sym[0],_sym[1]);

typedef double (*fnc_p)(const double *cofptr,const double *cofend);
double std(const double *cofptr,const double *cofend);
double dbl(const double *cofptr,const double *cofend);
double qed(const double *cofptr,const double *cofend);

#endif

x86：长循环携带的依赖链。为什么13个周期？

2 个答案:

以预期速度运行的测试代码：

实验测试

IACA同意：IvB每次迭代9c