我有两种代码迭代大小为500的矢量设计。其中一种设计包含64位双精度数组,第二种设计使用包含32位整数的数组。我期待32位设计更快,因为可以在缓存中打包更多有用的数据。
编译器MSVC,CPU Ivy Bridge,编译64位模式。
这是代码1,使用32位整数(在 2600 CPU周期中运行):
#include <vector>
#include <iostream>
int main(){
std::vector<unsigned int> x1;
std::vector<unsigned int> x2;
std::vector<unsigned int> x3;
x1.resize(500);
x2.resize(500);
x3.resize(500);
for(int i =0; i<500; i++){
x1[i] = i;
x2[i] = 2*i;
x3[i] = 4*i;
}
int counter = 0;
while(counter < 1000){
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
double n = 0;
start = __rdtsc();
for(int i=0; i < 500; i++){
unsigned int a = x1[i];
unsigned int b = x2[i];
unsigned int g = x3[i];
m = m + (a * g);
n = n + (b * g);
}
end = __rdtscp();
std::cout << (end-start) << "\t\t"<<m << n << std::endl;
counter++;
}
}
产生这个asm(-Os):
start = __rdtscp(&p);
rdtscp
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rax,rdx
mov r10,rax
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
mov r8,rbx
mov r9d,1F4h
unsigned int a = x1[i];
unsigned int b = x2[i];
unsigned int g = x3[i];
mov edx,dword ptr [r8+r15]
m = m + (a * g);
mov ecx,edx
imul ecx,dword ptr [r8+r14]
xorps xmm0,xmm0
cvtsi2sd xmm0,rcx
addsd xmm7,xmm0
n = n + (b * g);
imul edx,dword ptr [r8]
mov eax,edx
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
addsd xmm8,xmm0
for(int i=0; i < 500; i++){
add r8,4
dec r9
jne main+0E5h (013F681261h)
}
end = __rdtscp(&q);
rdtscp
}
end = __rdtscp(&q);
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rdx,rax
这是代码2,使用64位双精度数(代码在 2000 CPU周期中运行):
#include <vector>
#include <iostream>
int main(){
std::vector<double> x1;
std::vector<double> x2;
std::vector<unsigned long long> x3;
x1.resize(500);
x2.resize(500);
x3.resize(500);
for(int i =0; i<500; i++){
x1[i] = i;
x2[i] = 2*i;
x3[i] = 4*i;
}
int counter = 0;
while(counter < 1000){
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
double n = 0;
start = __rdtscp(&p);
for(int i=0; i < 500; i++){
double a = x1[i];
double b = x2[i];
unsigned long long g = x3[i];
m = m + (a * g);
n = n + (b * g);
}
end = __rdtscp(&q);
std::cout << (end-start) << "\t\t"<<m << n << std::endl;
counter++;
}
}
这里是asm(-Os)产生的:
start = __rdtscp(&p);
rdtscp
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rax,rdx
mov r9,rax
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
mov rdx,rbx
mov r8d,1F4h
double a = x1[i];
double b = x2[i];
unsigned long long g = x3[i];
mov rcx,qword ptr [rdx+r15]
xorps xmm1,xmm1
m = m + (a * g);
cvtsi2sd xmm1,rcx
test rcx,rcx
jns main+120h (013F32129Ch)
addsd xmm1,xmm9
movaps xmm0,xmm1
mulsd xmm0,mmword ptr [rdx+r14]
addsd xmm6,xmm0
n = n + (b * g);
mulsd xmm1,mmword ptr [rdx]
addsd xmm7,xmm1
for(int i=0; i < 500; i++){
add rdx,8
dec r8
jne main+10Ah (013F321286h)
}
end = __rdtscp(&q);
rdtscp
}
end = __rdtscp(&q);
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rdx,rax
答案 0 :(得分:6)
不同之处在于第一个代码中的整数转换为双精度(向量包含unsigned int
,产品采用整数运算,但积累使用double
,在汇编程序中这会添加{{ 1}}指令代码。)
在第二个代码中,您在任何地方都使用双打,因此您无法进行转换,代码运行得更快。
对于在固定点和浮点处理单元之间有更严格区别的CPU而言,这种差异会更加明显(POWER平台就是一个例子)。在这方面,X86平台非常宽容。