通过查看下面n-body仿真的两个实现,我注意到C++版本的速度大约是Java版本的两倍。我想了解造成这种性能差异的因素。各种编程语言的n-body仿真基准测试结果可以在这里找到:
http://benchmarksgame.alioth.debian.org/u64q/nbody.html
这两个程序都是计算机语言基准游戏的一部分,可以在这里找到:
http://benchmarksgame.alioth.debian.org/
下面首先给出C++版本。它使用以下编译器标志编译:/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=native -mfpmath=sse -msse3 --std=c++11 -fopenmp
/* The Computer Language Benchmarks Game
http://benchmarksgame.alioth.debian.org/
contributed by Mark C. Lewis
modified slightly by Chad Whipkey
converted from java to c++,added sse support, by Branimir Maksimovic
converted from c++ to c, by Alexey Medvedchikov
converted from c to c++11, by Walter Landry
modified by Dmitri Naumov
*/
#include <algorithm>
#include <stdio.h>
#include <cmath>
#include <stdlib.h>
#include <immintrin.h>
#include <array>
// Numeric constants shared by the body table and the simulation.
constexpr double PI = 3.141592653589793;
// Mass assigned to the sun: 4 * pi^2.
constexpr double SOLAR_MASS = 4 * PI * PI;
// Factor by which the planets' initial velocity components are multiplied.
constexpr double DAYS_PER_YEAR = 365.24;
// One simulated body: position x[0..2], velocity v[0..2] and mass.
//
// The record is eight doubles wide: x[3], fill, v[3], mass. `fill` is never
// given a value other than 0 by the constructor.
// NOTE(review): `fill` presumably exists so that the position and the
// velocity/mass halves are each four doubles wide — confirm before removing.
struct body {
    double x[3];
    double fill = 0;
    double v[3];
    double mass;

    // Builds a body from position (x0, x1, x2), velocity (v0, v1, v2) and mass.
    constexpr body(double x0, double x1, double x2,
                   double v0, double v1, double v2,
                   double Mass)
        : x{x0, x1, x2}, v{v0, v1, v2}, mass(Mass)
    {
    }
};
class N_Body_System
{
static std::array<body,5> bodies;
void offset_momentum()
{
unsigned int k;
for(auto &body: bodies)
for(k = 0; k < 3; ++k)
bodies[0].v[k] -= body.v[k] * body.mass / SOLAR_MASS;
}
public:
N_Body_System()
{
offset_momentum();
}
void advance(double dt)
{
constexpr unsigned int N = ((bodies.size() - 1) * bodies.size()) / 2;
static double r[N][4];
static double mag[N];
unsigned int i, m;
__m128d dx[3], dsquared, distance, dmag;
i=0;
for(auto bi(bodies.begin()); bi!=bodies.end(); ++bi)
{
auto bj(bi);
for(++bj; bj!=bodies.end(); ++bj, ++i)
for (m=0; m<3; ++m)
r[i][m] = bi->x[m] - bj->x[m];
}
for (i=0; i<N; i+=2)
{
for (m=0; m<3; ++m)
{
dx[m] = _mm_loadl_pd(dx[m], &r[i][m]);
dx[m] = _mm_loadh_pd(dx[m], &r[i+1][m]);
}
dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
distance = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dsquared)));
for (m=0; m<2; ++m)
distance = distance * _mm_set1_pd(1.5)
- ((_mm_set1_pd(0.5) * dsquared) * distance)
* (distance * distance);
dmag = _mm_set1_pd(dt) / (dsquared) * distance;
_mm_store_pd(&mag[i], dmag);
}
i=0;
for(auto bi(bodies.begin()); bi!=bodies.end(); ++bi)
{
auto bj(bi);
for(++bj; bj!=bodies.end(); ++bj, ++i)
for(m=0; m<3; ++m)
{
const double x = r[i][m] * mag[i];
bi->v[m] -= x * bj->mass;
bj->v[m] += x * bi->mass;
}
}
for(auto &body: bodies)
for(m=0; m<3; ++m)
body.x[m] += dt * body.v[m];
}
double energy()
{
double e(0.0);
for(auto bi(bodies.cbegin()); bi!=bodies.cend(); ++bi)
{
e += bi->mass * ( bi->v[0] * bi->v[0]
+ bi->v[1] * bi->v[1]
+ bi->v[2] * bi->v[2] ) / 2.;
auto bj(bi);
for(++bj; bj!=bodies.end(); ++bj)
{
double distance = 0;
for(auto k=0; k<3; ++k)
{
const double dx = bi->x[k] - bj->x[k];
distance += dx * dx;
}
e -= (bi->mass * bj->mass) / std::sqrt(distance);
}
}
return e;
}
};
std::array<body,5> N_Body_System::bodies{{
/* sun */
body(0., 0., 0. ,
0., 0., 0. ,
SOLAR_MASS),
/* jupiter */
body(4.84143144246472090e+00,
-1.16032004402742839e+00,
-1.03622044471123109e-01 ,
1.66007664274403694e-03 * DAYS_PER_YEAR,
7.69901118419740425e-03 * DAYS_PER_YEAR,
-6.90460016972063023e-05 * DAYS_PER_YEAR ,
9.54791938424326609e-04 * SOLAR_MASS
),
/* saturn */
body(8.34336671824457987e+00,
4.12479856412430479e+00,
-4.03523417114321381e-01 ,
-2.76742510726862411e-03 * DAYS_PER_YEAR,
4.99852801234917238e-03 * DAYS_PER_YEAR,
2.30417297573763929e-05 * DAYS_PER_YEAR ,
2.85885980666130812e-04 * SOLAR_MASS
),
/* uranus */
body(1.28943695621391310e+01,
-1.51111514016986312e+01,
-2.23307578892655734e-01 ,
2.96460137564761618e-03 * DAYS_PER_YEAR,
2.37847173959480950e-03 * DAYS_PER_YEAR,
-2.96589568540237556e-05 * DAYS_PER_YEAR ,
4.36624404335156298e-05 * SOLAR_MASS
),
/* neptune */
body(1.53796971148509165e+01,
-2.59193146099879641e+01,
1.79258772950371181e-01 ,
2.68067772490389322e-03 * DAYS_PER_YEAR,
1.62824170038242295e-03 * DAYS_PER_YEAR,
-9.51592254519715870e-05 * DAYS_PER_YEAR ,
5.15138902046611451e-05 * SOLAR_MASS
)
}};
// Entry point. Expects the number of simulation steps as the first
// command-line argument, prints the system energy, runs that many steps of
// size 0.01, then prints the energy again.
// Returns 0 on success, 1 if the step count argument is missing.
int main(int argc, char** argv)
{
    // Without this check a missing argument hands a null pointer to atoi.
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <steps>\n", argc > 0 ? argv[0] : "nbody");
        return 1;
    }

    const int n = atoi(argv[1]);
    N_Body_System system;
    printf("%.9f\n", system.energy());
    for (int i = 0; i < n; ++i)
        system.advance(0.01);
    printf("%.9f\n", system.energy());
    return 0;
}
接下来给出Java版本:
/* The Computer Language Benchmarks Game
http://benchmarksgame.alioth.debian.org/
contributed by Mark C. Lewis
modified slightly by Chad Whipkey
modified slightly by Stefan Feldbinder
modified slightly by Tagir Valeev
*/
/**
 * Command-line driver: reads the step count from the first argument, prints
 * the system energy, advances the system that many times with a step of 0.01,
 * and prints the energy again.
 */
public final class nbody {
    public static void main(String[] args) {
        final int steps = Integer.parseInt(args[0]);
        final NBodySystem system = new NBodySystem();

        System.out.printf("%.9f\n", system.energy());

        int done = 0;
        while (done < steps) {
            system.advance(0.01);
            ++done;
        }

        System.out.printf("%.9f\n", system.energy());
    }
}
/**
 * Holds the five bodies (sun, jupiter, saturn, uranus, neptune) and steps
 * them forward in time.
 */
final class NBodySystem {
    private static final int LENGTH = 5;

    private Body[] bodies;

    /**
     * Creates the five bodies, sums mass-weighted velocities over all of them,
     * and passes that total to the first body's offsetMomentum.
     */
    public NBodySystem() {
        bodies = new Body[] {
            Body.sun(), Body.jupiter(), Body.saturn(), Body.uranus(), Body.neptune()
        };

        double px = 0.0;
        double py = 0.0;
        double pz = 0.0;
        for (int idx = 0; idx < LENGTH; ++idx) {
            final Body current = bodies[idx];
            px += current.vx * current.mass;
            py += current.vy * current.mass;
            pz += current.vz * current.mass;
        }
        bodies[0].offsetMomentum(px, py, pz);
    }

    /**
     * Advances the system by one step of size dt: updates the velocities of
     * both bodies in every pair, then moves each body by dt times its velocity.
     */
    public void advance(double dt) {
        final Body[] all = bodies;

        for (int first = 0; first < LENGTH - 1; ++first) {
            final Body a = all[first];
            final double aMass = a.mass;
            final double ax = a.x;
            final double ay = a.y;
            final double az = a.z;

            for (int second = first + 1; second < LENGTH; ++second) {
                final Body c = all[second];
                final double dx = ax - c.x;
                final double dy = ay - c.y;
                final double dz = az - c.z;

                final double dSquared = dx * dx + dy * dy + dz * dz;
                final double distance = Math.sqrt(dSquared);
                // dt / |d|^3
                final double mag = dt / (dSquared * distance);

                final double cMass = c.mass;
                a.vx -= dx * cMass * mag;
                a.vy -= dy * cMass * mag;
                a.vz -= dz * cMass * mag;

                c.vx += dx * aMass * mag;
                c.vy += dy * aMass * mag;
                c.vz += dz * aMass * mag;
            }
        }

        for (int idx = 0; idx < LENGTH; ++idx) {
            final Body moved = all[idx];
            moved.x += dt * moved.vx;
            moved.y += dt * moved.vy;
            moved.z += dt * moved.vz;
        }
    }

    /**
     * Returns the sum over all bodies of 0.5 * mass * |v|^2, minus the sum
     * over all pairs of (mass_i * mass_j) / distance.
     */
    public double energy() {
        double e = 0.0;

        for (int first = 0; first < bodies.length; ++first) {
            final Body a = bodies[first];
            e += 0.5 * a.mass *
                ( a.vx * a.vx
                + a.vy * a.vy
                + a.vz * a.vz );

            for (int second = first + 1; second < bodies.length; ++second) {
                final Body c = bodies[second];
                final double dx = a.x - c.x;
                final double dy = a.y - c.y;
                final double dz = a.z - c.z;
                final double distance = Math.sqrt(dx*dx + dy*dy + dz*dz);
                e -= (a.mass * c.mass) / distance;
            }
        }
        return e;
    }
}
/**
 * Mutable record for one body: position (x, y, z), velocity (vx, vy, vz) and
 * mass, plus factory methods for the five bodies used by the simulation.
 */
final class Body {
    static final double PI = 3.141592653589793;
    static final double SOLAR_MASS = 4 * PI * PI;
    static final double DAYS_PER_YEAR = 365.24;

    public double x, y, z, vx, vy, vz, mass;

    public Body() {}

    /** Creates a body with every field set from the given arguments. */
    private static Body create(double x, double y, double z,
                               double vx, double vy, double vz,
                               double mass) {
        final Body b = new Body();
        b.x = x;
        b.y = y;
        b.z = z;
        b.vx = vx;
        b.vy = vy;
        b.vz = vz;
        b.mass = mass;
        return b;
    }

    /** The sun: only the mass is set; position and velocity keep their default 0.0. */
    static Body sun() {
        final Body s = new Body();
        s.mass = SOLAR_MASS;
        return s;
    }

    static Body jupiter() {
        return create(
            4.84143144246472090e+00,
            -1.16032004402742839e+00,
            -1.03622044471123109e-01,
            1.66007664274403694e-03 * DAYS_PER_YEAR,
            7.69901118419740425e-03 * DAYS_PER_YEAR,
            -6.90460016972063023e-05 * DAYS_PER_YEAR,
            9.54791938424326609e-04 * SOLAR_MASS);
    }

    static Body saturn() {
        return create(
            8.34336671824457987e+00,
            4.12479856412430479e+00,
            -4.03523417114321381e-01,
            -2.76742510726862411e-03 * DAYS_PER_YEAR,
            4.99852801234917238e-03 * DAYS_PER_YEAR,
            2.30417297573763929e-05 * DAYS_PER_YEAR,
            2.85885980666130812e-04 * SOLAR_MASS);
    }

    static Body uranus() {
        return create(
            1.28943695621391310e+01,
            -1.51111514016986312e+01,
            -2.23307578892655734e-01,
            2.96460137564761618e-03 * DAYS_PER_YEAR,
            2.37847173959480950e-03 * DAYS_PER_YEAR,
            -2.96589568540237556e-05 * DAYS_PER_YEAR,
            4.36624404335156298e-05 * SOLAR_MASS);
    }

    static Body neptune() {
        return create(
            1.53796971148509165e+01,
            -2.59193146099879641e+01,
            1.79258772950371181e-01,
            2.68067772490389322e-03 * DAYS_PER_YEAR,
            1.62824170038242295e-03 * DAYS_PER_YEAR,
            -9.51592254519715870e-05 * DAYS_PER_YEAR,
            5.15138902046611451e-05 * SOLAR_MASS);
    }

    /**
     * Overwrites this body's velocity with (-px, -py, -pz) / SOLAR_MASS and
     * returns this body.
     */
    Body offsetMomentum(double px, double py, double pz) {
        this.vx = -px / SOLAR_MASS;
        this.vy = -py / SOLAR_MASS;
        this.vz = -pz / SOLAR_MASS;
        return this;
    }
}
那么,造成C++版本性能优势的因素是什么(从编译器标志可以看出,C++版本使用了SSE扩展,包括用于浮点运算,而Java版本使用的是java.lang.Math)?特别是,_mm_loadl_pd、_mm_loadh_pd、_mm_cvtps_pd 和 _mm_rsqrt_ps 这些内建函数(intrinsics)分别对应什么操作?
如果性能提升来自浮点SSE扩展,Java是否也能够使用这样的扩展?
答案 0 :(得分:1)
正如您在基准测试表格中看到的那样,具有自动内存管理功能的语言总是很慢。解释型语言也会经过编译,但Java代码和C#代码不会被编译成针对特定体系结构的可执行文件;而C++则会生成针对特定体系结构的可执行文件。
如果您需要极致的速度,就应该使用C++。但通常并不需要这样的速度。
选择语言是一项非常困难的任务,首先必须从要完成的具体任务出发。
答案 1 :(得分:1)
有两个优化正在发挥作用。第一个是通过下面两行把两个double打包进一个__m128d:
dx[m] = _mm_loadl_pd(dx[m], &r[i][m]);
dx[m] = _mm_loadh_pd(dx[m], &r[i+1][m]);
实际上,通过一次处理两个位移(r[i] 和 r[i+1]),您在某种意义上把内循环并行化了,这就是为什么外层循环使用 i+=2 而不是 ++i 递增。
如果您用一条语句同时打包 r[i][m] 和 r[i+1][m] 的值,而不是分两行分别加载,程序实际上还可以更快:
dx[m] = _mm_set_pd(r[i+1][m], r[i][m]);
在我的机器上,这进一步缩短了4%的计算时间。
另一个优化来自于使用 _mm_rsqrt_ps,即单精度倒数平方根的内建函数(intrinsic)。我不确定这种优化是有意为之,还是作者不得不这样做,因为我找不到适用于 __m128d 的双精度平方根或倒数平方根内建函数(注:SSE2 实际上提供了双精度平方根内建函数 _mm_sqrt_pd,但没有对应的双精度倒数平方根近似指令)。但无论如何,单精度平方根比双精度平方根要快。
为了使用单精度算术,打包的 __m128d 首先由 _mm_cvtpd_ps 转换为四个float(其中两个由两个double转换而来,另外两个为零)。然后 _mm_rsqrt_ps 同时计算这四个float的倒数平方根。接着 _mm_cvtps_pd 把其中两个float的结果转换回double(另外两个被丢弃),于是程序中就有了下面这一行:
_mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dsquared)));
不幸的是,单精度倒数平方根不如双精度平方根准确。为了修正误差,我们使用入门微积分课程中通常会介绍的一阶泰勒展开。令
y = f(a) := a^{-1/2}
然后,如果我们用一个小量 h 对 a 进行扰动,那么关于 h=0 的一阶泰勒展开式为
f(a+h) ≈ f(a) + hf'(a) = f(a) - (a+h - a)a^{-3/2}/2 = 1.5f(a) - 0.5(a+h)f(a)^3
现在,从概念上讲,dSquared=a+h,distance=f(a),其中 a 是某个接近 dSquared 的值(或者说 h 是某个接近零的值)。因此,上面的泰勒展开给出了
f(dSquared) ≈ 1.5 * distance - 0.5 * dSquared * distance * distance * distance
但是请记住,我们已经把两个double打包进了 __m128d,所以不能直接把它乘以标量1.5或0.5。相反,我们把两个等于1.5的double打包成一个 __m128d,并对0.5做同样的处理。这就是程序中出现下面这几行的原因:
distance = distance * _mm_set1_pd(1.5)
- ((_mm_set1_pd(0.5) * dsquared) * distance)
* (distance * distance);
请注意,作者使用的是倒数平方根而不是平方根。如果使用平方根(不取倒数),可以用巴比伦方法(Babylonian method)来修正数值误差,程序的相关部分将变为
distance = _mm_cvtps_pd(_mm_sqrt_ps(_mm_cvtpd_ps(dsquared)));
// now dsquared == distance^2 rather than dsquared == 1/distance^2
// Babylonian method: to find the square root x of a number S,
// iterate x_{n+1} = 0.5 * (x_n + S/x_n)
for (m=0; m<2; ++m)
distance = _mm_set1_pd(0.5) * (distance + dsquared/distance);
// was: dmag = _mm_set1_pd(dt) / dsquared * distance;
dmag = _mm_set1_pd(dt) / (dsquared * distance);
但是,巴比伦方法每次迭代中额外的除法 dsquared/distance 可能会使程序变慢。此外,两次迭代似乎不足以给出准确的结果。因此,这里应该使用倒数平方根,因为它能得到更快、更准确的程序。