我修改了之前一个实验(Agner Fog《Optimizing Assembly》中的示例 12.10a)里的代码,使各条指令之间的依赖关系更强:
_id
现在每次迭代需要大约13个周期,但我不知道为什么这么多。 请帮我理解。
(更新)抱歉,@Peter Cordes 说得对——这段代码实际上每次迭代需要 9 个周期。这个误会是我自己造成的:我把两段相似的代码(只是指令顺序互换)弄混了。需要 13 个周期的代码在这里:
; Loop fragment from the question (NASM syntax, 32-bit pointer in eax).
; xmm1 is the only value carried from one iteration to the next.
; NOTE(review): the text introducing this listing calls it the 13-cycle
; variant, but here both mulsd read the previous iteration's xmm1, which is
; the ordering analysed as ~9 cycles in the answer; the 13-cycle variant
; presumably has the two mulsd swapped -- confirm against the original post.
movsd xmm2, [x]         ; xmm2 = x, loop-invariant multiplier
movsd xmm1, [one]       ; xmm1 = initial value of the carried accumulator
xorps xmm0, xmm0        ; xmm0 = 0.0 (not referenced inside this loop)
mov eax, coeff          ; eax = address of the first coefficient
L1:
movsd xmm3, [eax]       ; xmm3 = current coefficient
mulsd xmm3, xmm1        ; xmm3 *= xmm1 (value left by the previous iteration)
mulsd xmm1, xmm2        ; xmm1 *= x (also reads the previous iteration's xmm1)
addsd xmm1, xmm3        ; xmm1 += xmm3; this result feeds the next iteration
add eax, 8              ; step to the next 8-byte double
cmp eax, coeff_end
jb L1                   ; unsigned compare: continue while eax < coeff_end
答案 0 :(得分:2)
在我的 Core2 E6600 上,它以每次迭代 9 个周期的速度运行,这与预期一致:
; Legend: "Nc" = latency in cycles; "reg:X" = value of reg produced by the instruction labelled X.
movsd xmm3, [eax] ; independent, depends only on eax
A: mulsd xmm3, xmm1 ; 5c: depends on xmm1:C from last iteration
B: mulsd xmm1, xmm2 ; 5c: depends on xmm1:C from last iteration
C: addsd xmm1, xmm3 ; 3c: depends on xmm1:B from THIS iteration (and xmm3:A from this iteration)
当迭代 i 的 xmm1:C 就绪后,下一次迭代就可以开始计算 A 和 B 这两个乘法。
无论哪一个先执行,两者都必须在 C 执行之前完成。因此循环携带的依赖链是 5 + 3 个周期,再加上 1 个周期的资源冲突——它使两个乘法无法在同一个周期开始执行。
当数组大小为 8B * 128 * 1024 时,速度会降到每次迭代约 11 个周期。如果您是用一个更大的数组来测试,而不是在您贴出的循环外面再套一层重复循环,那么这可能就是您看到更高延迟的原因。
如果某次加载(load)的数据迟到,CPU 就无法“追回”损失的时间,因为这会推迟循环携带的依赖链。如果加载的结果只被从循环携带链上分叉出去的依赖链使用,流水线就更容易吸收偶尔的慢速加载。因此,有些循环对内存延迟比另一些循环更敏感。
; Stand-alone Linux x86-64 benchmark for the loop under discussion (NASM/YASM syntax).
; Runs the inner loop over the whole coeff array 10000 times, then exits.
; 10000 * 24*1024 = 245,760,000 inner-loop iterations in total.
default REL
; IACA_start / IACA_end emit the byte patterns the IACA analyser looks for to
; delimit the analysed region. Each marker overwrites ebx.
%macro IACA_start 0
mov ebx, 111
db 0x64, 0x67, 0x90
%endmacro
%macro IACA_end 0
mov ebx, 222
db 0x64, 0x67, 0x90
%endmacro
global _start
_start:
movsd xmm2, [x]         ; xmm2 = x (loop-invariant multiplier)
movsd xmm1, [one]       ; xmm1 = initial value of the loop-carried accumulator
xorps xmm0, xmm0        ; xmm0 = 0.0 (not referenced by the loop below)
mov ecx, 10000          ; outer repeat count
outer_loop:
; NOTE(review): a 32-bit register holds the array address in a 64-bit build,
; so this presumably relies on coeff being linked below 4 GiB -- confirm
; before building as PIE.
mov eax, coeff
; The markers sit inside outer_loop, so they execute once per outer iteration.
IACA_start ; outside the loop
ALIGN 32 ; this matters on Core2, .78 insn per cycle vs. 0.63 without
L1:
movsd xmm3, [eax]       ; xmm3 = current coefficient
mulsd xmm3, xmm1        ; reads xmm1 from the previous iteration
mulsd xmm1, xmm2        ; also reads xmm1 from the previous iteration
addsd xmm1, xmm3        ; needs both products; feeds the next iteration
add eax, 8              ; next 8-byte element
cmp eax, coeff_end
jb L1                   ; unsigned: loop while eax < coeff_end
IACA_end
dec ecx
jnz outer_loop
;mov eax, 1
;int 0x80 ; exit() for 32bit code
xor edi, edi            ; exit status 0
mov eax, 231 ; exit_group(0). __NR_exit = 60.
syscall
section .data
; x and one are two labels for the same 8-byte 1.0, so x == one == 1.0 here.
x:
one: dq 1.0
section .bss
; 24*1024 qwords = 192 KiB of zero-initialised coefficients.
coeff: resq 24*1024 ; 6 * L1 size. Doesn't run any faster when it fits in L1 (resb)
coeff_end:
$ asm-link interiteration-test.asm
+ yasm -felf64 -Worphan-labels -gdwarf2 interiteration-test.asm
+ ld -o interiteration-test interiteration-test.o
$ perf stat ./interiteration-test
Performance counter stats for './interiteration-test':
928.543744 task-clock (msec) # 0.995 CPUs utilized
152 context-switches # 0.164 K/sec
1 cpu-migrations # 0.001 K/sec
52 page-faults # 0.056 K/sec
2,222,536,634 cycles # 2.394 GHz (50.14%)
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
1,723,575,954 instructions # 0.78 insns per cycle (75.06%)
246,414,304 branches # 265.377 M/sec (75.16%)
51,483 branch-misses # 0.02% of all branches (74.74%)
0.933372495 seconds time elapsed
每一个分支(即每 7 条指令)对应内循环的一次迭代。
$ bc -l
bc 1.06.95
1723575954 / 7
246225136.28571428571428571428
# ~= number of branches: good
2222536634 / .
9.026
# cycles per iteration
IACA 的分析结果(未计入 ALIGN 填充的 nop):
$ iaca.sh -arch IVB interiteration-test
Intel(R) Architecture Code Analyzer Version - 2.1
Analyzed File - interiteration-test
Binary Format - 64Bit
Architecture - IVB
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 9.00 Cycles Throughput Bottleneck: InterIteration
Port Binding In Cycles Per Iteration:
-------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 |
-------------------------------------------------------------------------
| Cycles | 2.0 0.0 | 1.0 | 0.5 0.5 | 0.5 0.5 | 0.0 | 2.0 |
-------------------------------------------------------------------------
N - port number or number of cycles resource conflict caused delay, DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3), CP - on a critical path
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion happened
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256 instruction, dozens of cycles penalty is expected
! - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | |
---------------------------------------------------------------------
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | movsd xmm3, qword ptr [eax]
| 1 | 1.0 | | | | | | | mulsd xmm3, xmm1
| 1 | 1.0 | | | | | | CP | mulsd xmm1, xmm2
| 1 | | 1.0 | | | | | CP | addsd xmm1, xmm3
| 1 | | | | | | 1.0 | | add eax, 0x8
| 1 | | | | | | 1.0 | | cmp eax, 0x63011c
| 0F | | | | | | | | jb 0xffffffffffffffe7
Total Num Of Uops: 6
答案 1 :(得分:1)
按照我在上面评论中建议的修改,把 addsd 改为 `addsd xmm0, xmm3` 之后,这段代码可以改写为使用整个寄存器宽度(打包双精度),性能可提升到约两倍。
大致思路如下:
`ones` 的初始值必须是:
double ones[2] = { 1.0, x }
同时需要把 `x` 替换为 `x2`:
double x2[2] = { x * x, x * x }
如果系数个数为奇数,则在末尾补一个零,使系数个数变为偶数。
另外,把指针增量改为 16。
以下是我得到的测试结果。我进行了多次试验并取最佳时间,每次试验重复调用 100 次以拉长计时区间。std 是 C 版本,dbl 是你的版本,qed 是“宽”(打包)版本:
R=1463870188
C=100
T=100
I=100
x: 3.467957099973322e+00 3.467957099973322e+00
one: 1.000000000000000e+00 3.467957099973322e+00
x2: 1.202672644725538e+01 1.202672644725538e+01
std: 2.803772098439484e+56 (ELAP: 0.000019312)
dbl: 2.803772098439484e+56 (ELAP: 0.000019312)
qed: 2.803772098439492e+56 (ELAP: 0.000009060)
rtn_loop: 2.179378907910304e+55 2.585834207648461e+56
rtn_shuf: 2.585834207648461e+56 2.179378907910304e+55
rtn_add: 2.803772098439492e+56 2.585834207648461e+56
这是在i7 920 @ 2.67 GHz上完成的。
我认为,如果你把这些耗时换算成时钟周期,你会发现你的版本比你想象的要快。
先说声抱歉:我很难让汇编器在其他语法下正常工作,所以改用了 AT&T 语法。另外,我用的是 Linux,所以用 rdi 和 rsi 寄存器来传递系数指针。如果你在 Windows 上,ABI 不同,你需要自行调整。
我写了一个 C 版本并对它做了反汇编。结果几乎与你的代码相同,只是它重新排列了非 xmm 指令的位置,我在下面的代码中也采用了这种排列。
我想我已经贴出了所有文件,所以如果你愿意,应该可以在自己的系统上运行它。
这是原始代码:
# xmmloop/dbl.s -- implement using one scalar double per iteration
        .text
        .globl  dbl
# dbl -- compute sum of c[i] * p[i], one scalar double per iteration
#
# C prototype: double dbl(const double *cofptr, const double *cofend);
# Syntax/ABI:  GAS AT&T, System V AMD64
#
# arguments:
#   rdi -- pointer to coeff vector
#   rsi -- pointer to coeff vector end (one past the last element)
# returns:
#   xmm0 -- accumulated sum; 0.0 when rdi >= rsi (empty range)
# reads:
#   x[0]   -- multiplier applied to the running power each iteration
#   one[0] -- initial running power
# clobbers:
#   rdi, xmm0-xmm3, flags
dbl:
        movsd   x(%rip),%xmm2           # xmm2 = x
        movsd   one(%rip),%xmm1         # xmm1 = running power, starts at one[0]
        xorps   %xmm0,%xmm0             # sum = 0

        # The loop below tests its condition at the bottom, so without this
        # guard an empty range would still load (and add) one element.
        cmp     %rsi,%rdi
        jae     dbl_done

dbl_loop:
        movsd   (%rdi),%xmm3            # c[i]
        add     $8,%rdi                 # advance to the next element
        cmp     %rsi,%rdi               # flags survive the SSE ops below
        mulsd   %xmm1,%xmm3             # c[i] * power
        mulsd   %xmm2,%xmm1             # power *= x
        addsd   %xmm3,%xmm0             # sum += c[i] * power
        jb      dbl_loop                # unsigned: loop while rdi < rsi

dbl_done:
        retq
以下代码已改为使用 movapd 等打包(packed)指令:
# xmmloop/qed.s -- implement using packed doubles (two per iteration)
        .text
        .globl  qed
# qed -- compute the sum two coefficients at a time with packed doubles
#
# C prototype: double qed(const double *cofptr, const double *cofend);
# Syntax/ABI:  GAS AT&T, System V AMD64
#
# arguments:
#   rdi -- pointer to coeff vector; must be 16-byte aligned (movapd load)
#   rsi -- pointer to coeff vector end; the element count must be even,
#          because each iteration consumes two doubles
# returns:
#   xmm0 (low double) -- sum of both lanes; 0.0 when rdi >= rsi
# reads:
#   x2[0..1]  -- per-iteration multiplier for both lanes
#   one[0..1] -- initial running powers of lane 0 and lane 1
# writes (debug only):
#   rtn_loop, rtn_shuf, rtn_add -- intermediate values of the reduction
# clobbers:
#   rdi, xmm0-xmm4, flags
qed:
        movapd  x2(%rip),%xmm2          # xmm2 = x2[0..1]
        movapd  one(%rip),%xmm1         # xmm1 = running powers for both lanes
        xorpd   %xmm4,%xmm4             # both lane sums = 0

        # The loop tests its condition at the bottom; skip it for an empty
        # range so no element is read. The reduction then runs on zeros.
        cmp     %rsi,%rdi
        jae     qed_done

qed_loop:
        movapd  (%rdi),%xmm3            # [c[i], c[i+1]]
        add     $16,%rdi                # advance two coefficients
        cmp     %rsi,%rdi               # flags survive the SSE ops below
        mulpd   %xmm1,%xmm3             # each coefficient times its lane's power
        mulpd   %xmm2,%xmm1             # advance both powers by x2
        addpd   %xmm3,%xmm4             # accumulate per lane
        jb      qed_loop                # unsigned: loop while rdi < rsi

qed_done:
        movapd  %xmm4,rtn_loop(%rip)    # DEBUG: per-lane sums
        movapd  %xmm4,%xmm0             # xmm0 = [lane0, lane1]
        shufpd  $1,%xmm4,%xmm4          # swap halves: xmm4 = [lane1, lane0]
        movapd  %xmm4,rtn_shuf(%rip)    # DEBUG: swapped sums
        addsd   %xmm4,%xmm0             # low double = lane0 + lane1
        movapd  %xmm0,rtn_add(%rip)     # DEBUG: final sum in the low double
        retq
这是代码的C版本:
// xmmloop/std -- compute result using C code
#include <xmmloop.h>
// std -- compute result using C
double
std(const double *cur,const double *ep)
{
double xt;
double xn;
double ci;
double sum;
xt = x[0];
xn = one[0];
sum = 0;
for (; cur < ep; ++cur) {
ci = *cur; // get c[i]
ci *= xn; // c[i]*x^i
xn *= xt; // x^(i+1)
sum += ci; // sum += c[i]*x^i
}
return sum;
}
这是我使用的测试程序:
// xmmloop/xmmloop -- test program
#define _XMMLOOP_GLO_
#include <xmmloop.h>
// tvget -- get high precision time
double
tvget(void)
{
struct timespec ts;
double sec;
clock_gettime(CLOCK_REALTIME,&ts);
sec = ts.tv_nsec;
sec /= 1e9;
sec += ts.tv_sec;
return sec;
}
// timeit -- time proc over opt_T trials of opt_I calls each and print the
// result of the last call together with the shortest trial time
void
timeit(fnc_p proc,double *cofptr,double *cofend,const char *tag)
{
    double fastest = 1e9;
    double result = 0;

    for (int trial = 0; trial < opt_T; ++trial) {
        double started = tvget();

        for (int rep = 0; rep < opt_I; ++rep)
            result = proc(cofptr,cofend);

        double elapsed = tvget();
        elapsed -= started;

        // keep the minimum over all trials
        if (elapsed < fastest)
            fastest = elapsed;
    }

    printf("%s: %.15e (ELAP: %.9f)\n",tag,result,fastest);
}
// main -- main program
//
// Options (the value is attached directly to the flag, e.g. -C200):
//   -C<n>  number of coefficients (default 100; rounded up to an even count)
//   -R<n>  random seed (default: time(NULL))
//   -T<n>  trials per routine (default 100; a bare -T means 1)
//   -I<n>  calls per trial (default 100; a bare -I means 1)
//
// Fills x, one and x2, generates random coefficients, then times std, dbl
// and qed and dumps qed's debug intermediates.
// Returns 0 on success, 1 if the coefficient buffer cannot be allocated.
int
main(int argc,char **argv)
{
    char *cp;
    void *mem;
    double *cofptr;
    double *cofend;
    double *cur;
    double val;
    long rseed;
    int cnt;

    --argc;
    ++argv;

    rseed = 0;
    cnt = 0;

    for (; argc > 0; --argc, ++argv) {
        cp = *argv;
        if (*cp != '-')
            break;

        switch (cp[1]) {
        case 'C':
            cp += 2;
            cnt = strtol(cp,&cp,10);
            break;

        case 'R':
            cp += 2;
            rseed = strtol(cp,&cp,10);
            break;

        case 'T':
            cp += 2;
            opt_T = (*cp != 0) ? strtol(cp,&cp,10) : 1;
            break;

        case 'I':
            cp += 2;
            opt_I = (*cp != 0) ? strtol(cp,&cp,10) : 1;
            break;
        }
    }

    if (rseed == 0)
        rseed = time(NULL);
    srand48(rseed);
    printf("R=%ld\n",rseed);

    // A negative count would turn into a huge size_t in the allocation
    // below, so treat it like "not given".
    if (cnt <= 0)
        cnt = 100;
    // qed consumes coefficients in pairs
    if (cnt & 1)
        ++cnt;
    printf("C=%d\n",cnt);

    if (opt_T == 0)
        opt_T = 100;
    printf("T=%d\n",opt_T);

    if (opt_I == 0)
        opt_I = 100;
    printf("I=%d\n",opt_I);

    // qed loads the coefficients with movapd, which requires 16-byte
    // alignment; request it explicitly instead of relying on malloc.
    if (posix_memalign(&mem,16,sizeof(double) * (size_t) cnt) != 0) {
        fprintf(stderr,"xmmloop: unable to allocate %d coefficients\n",cnt);
        return 1;
    }
    cofptr = mem;
    cofend = &cofptr[cnt];

    // pick x in [3,4)
    val = drand48();
    for (; val < 3; val += 1.0);

    x[0] = val;
    x[1] = val;
    DMP(x);

    // starting powers: lane 0 = x^0, lane 1 = x^1
    one[0] = 1.0;
    one[1] = val;
    DMP(one);

    // per-iteration step for the two-wide version: x^2 in both lanes
    val *= val;
    x2[0] = val;
    x2[1] = val;
    DMP(x2);

    for (cur = cofptr; cur < cofend; ++cur) {
        val = drand48();
        val *= 1e3;
        *cur = val;
    }

    timeit(std,cofptr,cofend,"std");
    timeit(dbl,cofptr,cofend,"dbl");
    timeit(qed,cofptr,cofend,"qed");

    DMP(rtn_loop);
    DMP(rtn_shuf);
    DMP(rtn_add);

    free(cofptr);

    return 0;
}
头文件:
// xmmloop/xmmloop.h -- common control
//
// Shared declarations for the test driver, the C reference routine and the
// assembly routines. Exactly one translation unit defines _XMMLOOP_GLO_
// before including this header; that unit defines the globals below and
// every other includer sees them as extern.
#ifndef _xmmloop_xmmloop_h_
#define _xmmloop_xmmloop_h_
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// EXTRN_XMMLOOP expands to nothing in the defining unit, "extern" elsewhere.
#ifdef _XMMLOOP_GLO_
#define EXTRN_XMMLOOP /**/
#else
#define EXTRN_XMMLOOP extern
#endif
// 16-byte alignment, needed because the assembly accesses these arrays
// with movapd.
#define XMMALIGN __attribute__((aligned(16)))
EXTRN_XMMLOOP int opt_T;                        // number of timing trials
EXTRN_XMMLOOP int opt_I;                        // calls per timing trial
EXTRN_XMMLOOP double x[2] XMMALIGN;             // { x, x }
EXTRN_XMMLOOP double x2[2] XMMALIGN;            // { x*x, x*x }
EXTRN_XMMLOOP double one[2] XMMALIGN;           // { 1.0, x }
EXTRN_XMMLOOP double rtn_loop[2] XMMALIGN;      // qed debug: sums after the loop
EXTRN_XMMLOOP double rtn_shuf[2] XMMALIGN;      // qed debug: sums after shufpd
EXTRN_XMMLOOP double rtn_add[2] XMMALIGN;       // qed debug: value after the final addsd
// DMP -- print both elements of a two-element double array, labelled with
// its name. The expansion already ends in ';'.
#define DMP(_sym) \
printf(#_sym ": %.15e %.15e\n",_sym[0],_sym[1]);
// Common signature of the routines under test: sum over [cofptr, cofend).
typedef double (*fnc_p)(const double *cofptr,const double *cofend);
double std(const double *cofptr,const double *cofend);  // C reference
double dbl(const double *cofptr,const double *cofend);  // assembly, scalar
double qed(const double *cofptr,const double *cofend);  // assembly, packed
#endif