我的目标是调查我在静态和动态分配之间观察到的CPU时间差异,具体取决于是否连续访问内存。
为了使这个调查尽可能合理,我用C ++和Fortran程序引导它。那些尽可能简单,核心部分在于计算两个随机填充的矩阵乘法。这是C ++代码:
#include <iostream>
#include <iomanip>
#include <sstream>
#include <random>
#include <string>
#include <chrono>
#include <ctime>
using namespace std;
#ifdef ALLOCATION_DYNAMIC
//
// Use a home made matrix class when dynamically allocating.
//
class matrix
{
private:
int n_;
int m_;
double *data_;
public:
matrix();
~matrix();
double* operator[](int i);
void resize(int n, int m);
double& operator()(int i, int j);
const double& operator()(int i, int j) const;
};
matrix::matrix() : n_(0), m_(0), data_(NULL)
{
return;
}
matrix::~matrix()
{
if (data_) delete[] data_;
return;
}
void matrix::resize(int n, int m)
{
if (data_) delete[] data_;
n_ = n;
m_ = m;
data_ = new double[n_ * m_];
}
inline double& matrix::operator()(int i, int j)
{
return *(data_ + i * m_ + j);
}
inline const double& matrix::operator()(int i, int j) const
{
return *(data_ + i * m_ + j);
}
#endif
// Record the optimization flag we were compiled with.
string optflag = OPTFLAG;
//
// Main program.
//
int main(int argc, char *argv[])
{
cout << "optflag " << optflag;
#ifdef ALLOCATION_DYNAMIC
int n = N;
matrix cc1;
matrix cc2;
matrix cc3;
#else
const int n = N;
// It is necessary to specify the static keyword
// because the default is "automatic", so that
// data is entirely put on the stack which quickly
// get overflowed with greater N values.
static double cc1[N][N];
static double cc2[N][N];
static double cc3[N][N];
#endif
cout << " allocation ";
#ifdef ALLOCATION_DYNAMIC
cout << "dynamic";
if (argc > 1)
{
istringstream iss(argv[1]);
iss >> n;
}
cc1.resize(n, n);
cc2.resize(n, n);
cc3.resize(n, n);
#else
cout << "static";
#endif
cout << " N " << n << flush;
// Init.
string seed = SEED;
std::seed_seq seed_sequence (seed.begin(), seed.end());
// Standard, 64 bit based, Mersenne Twister random engine.
std::mt19937_64 generator (seed_sequence);
// Random number between [0, 1].
std::uniform_real_distribution<double> random_unity(double(0), double(1));
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
{
#ifdef ALLOCATION_DYNAMIC
cc1(i, j) = random_unity(generator);
cc2(i, j) = random_unity(generator);
cc3(i, j) = double(0);
#else
cc1[i][j] = random_unity(generator);
cc2[i][j] = random_unity(generator);
cc3[i][j] = double(0);
#endif
}
clock_t cpu_begin = clock();
auto wall_begin = std::chrono::high_resolution_clock::now();
cout << " transpose ";
#ifdef TRANSPOSE
cout << "yes";
// Transpose.
for (int i = 0; i < n; ++i)
for (int j = 0; j < i; ++j)
{
#ifdef ALLOCATION_DYNAMIC
double tmp = cc2(i, j);
cc2(i, j) = cc2(j, i);
cc2(j, i) = tmp;
#else
double tmp = cc2[i][j];
cc2[i][j] = cc2[j][i];
cc2[j][i] = tmp;
#endif
}
#else
cout << "no";
#endif
cout << flush;
// Work.
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
for (int k = 0; k < n; ++k)
{
#if defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
cc3(i, j) += cc1(i, k) * cc2(j, k);
#elif defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
cc3(i, j) += cc1(i, k) * cc2(k, j);
#elif ! defined(ALLOCATION_DYNAMIC) && defined(TRANSPOSE)
cc3[i][j] += cc1[i][k] * cc2[j][k];
#elif ! defined(ALLOCATION_DYNAMIC) && ! defined(TRANSPOSE)
cc3[i][j] += cc1[i][k] * cc2[k][j];
#else
#error("Wrong preprocess instructions.");
#endif
}
clock_t cpu_end = clock();
auto wall_end = std::chrono::high_resolution_clock::now();
double sum(0);
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
{
#ifdef ALLOCATION_DYNAMIC
sum += cc3(i, j);
#else
sum += cc3[i][j];
#endif
}
sum /= double(n * n);
cout << " cpu " << setprecision(16) << double(cpu_end - cpu_begin) / double(CLOCKS_PER_SEC)
<< " wall " << setprecision(16) << std::chrono::duration<double>(wall_end - wall_begin).count()
<< " sum " << setprecision(16) << sum << endl;
return 0;
}
这是Fortran代码:
program Test
#ifdef ALLOCATION_DYNAMIC
integer :: n = N
double precision, dimension(:,:), allocatable :: cc1
double precision, dimension(:,:), allocatable :: cc2
double precision, dimension(:,:), allocatable :: cc3
#else
integer, parameter :: n = N
double precision, dimension(n,n) :: cc1
double precision, dimension(n,n) :: cc2
double precision, dimension(n,n) :: cc3
#endif
character(len = 5) :: optflag = OPTFLAG
character(len = 8) :: time = SEED
#ifdef ALLOCATION_DYNAMIC
character(len = 10) :: arg
#endif
#ifdef TRANSPOSE
double precision :: tmp
#endif
double precision :: sum
double precision :: cpu_start, cpu_end, wall_start, wall_end
integer :: clock_reading, clock_rate, clock_max
integer :: i, j, k, s
double precision, dimension(2) :: harvest
integer, dimension(:), allocatable :: seed
write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
if (iargc().gt.0) then
call getarg(1, arg)
read(arg, '(I8)') n
end if
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n
#ifdef ALLOCATION_DYNAMIC
allocate(cc1(n, n))
allocate(cc2(n, n))
allocate(cc3(n, n))
#endif
! Init.
call random_seed(size = s)
allocate(seed(s))
seed = 0
read(time(1:2), '(I2)') seed(1)
read(time(4:5), '(I2)') seed(2)
read(time(7:8), '(I2)') seed(3)
call random_seed(put = seed)
deallocate(seed)
do i = 1, n
do j = 1, n
call random_number(harvest)
cc1(i, j) = harvest(1)
cc2(i, j) = harvest(2)
cc3(i, j) = dble(0)
enddo
enddo
write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
#ifdef TRANSPOSE
write(6, FMT = '(A)', ADVANCE = 'NO') "yes"
! Transpose.
do j = 1, n
do i = 1, j - 1
tmp = cc1(i, j)
cc1(i, j) = cc1(j, i)
cc1(j, i) = tmp
enddo
enddo
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "no"
#endif
call cpu_time(cpu_start)
call system_clock (clock_reading, clock_rate, clock_max)
wall_start = dble(clock_reading) / dble(clock_rate)
! Work.
do j = 1, n
do i = 1, n
do k = 1, n
#ifdef TRANSPOSE
cc3(i, j) = cc3(i, j) + cc1(k, i) * cc2(k, j)
#else
cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
#endif
enddo
enddo
enddo
sum = dble(0)
do j = 1, n
do i = 1, n
sum = sum + cc3(i, j)
enddo
enddo
sum = sum / (n * n)
call cpu_time(cpu_end)
call system_clock (clock_reading, clock_rate, clock_max)
wall_end = dble(clock_reading) / dble(clock_rate)
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
write(6, FMT = '(A,F23.16)') " sum ", sum
#ifdef ALLOCATION_DYNAMIC
deallocate(cc1)
deallocate(cc2)
deallocate(cc3)
#endif
end program Test
我尝试使两个程序尽可能相似,考虑到C / C ++是行顺序主要而Fortran是列顺序主要。
尽可能连续读取矩阵,例外是矩阵乘法,因为当执行C = A x B时,A通常按行读取,而B则按列读取。
两个程序都可以通过让一个矩阵,A或B取决于语言,不按顺序访问,或通过转置矩阵A或B,然后在矩阵乘法期间连续读取,即通过传递TRANSPOSE预处理指令实现。
以下行给出了编译过程的所有细节((GCC)4.8.1):
gfortran -o f90-dynamic -cpp -Wall -pedantic -fimplicit-none -O3 -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" -DALLOCATION_DYNAMIC src/test.f90
gfortran -o f90-static -cpp -Wall -pedantic -fimplicit-none -O3 -DOPTFLAG=\"-O3\" -DTRANSPOSE -DN=1000 -DSEED=\"15:11:18\" src/test.f90
g++ -o cpp-dynamic -Wall -pedantic -std=gnu++0x -O3 -DALLOCATION_DYNAMIC -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp
g++ -o cpp-static -Wall -pedantic -std=gnu++0x -O3 -DN=1000 -DOPTFLAG=\"-O3\" -DSEED=\"15:11:18\" -DTRANSPOSE src/test.cpp
这四行产生四个程序,其中A或B矩阵最初被转置.N预处理指令初始化默认矩阵大小,在使用静态字段时必须在编译时知道。这是要注意到所有程序都是以我目前所知的最高优化度(O3)编译的。
我运行了所有生成的程序,用于各种矩阵大小,从1000到5000.结果绘制在下面的图中,每种情况一个(转置与否):
主机系统
Intel(R)Xeon(R)CPU E5-2670 0 @ 2.60GHz
,堆栈大小为(ulimit -s)10240。
对于每个点,我运行了几次相同的程序,直到CPU时间的标准偏差与其平均值相比可以忽略不计。 方形和圆形分别代表Fortran和C ++,红色和绿色代表动态或静态。
在转置测试中,计算时间非常接近,尤其是主要区别来自语言(Fortran与C ++),动态与静态分配几乎没有差别。但是,静态分配似乎更快,特别是对于C ++。
在无转置测试中,计算时间明显更长,这是预期的,因为它不是顺序访问内存的速度较慢,但CPU时间的行为与以前不同:
我想了解无转置测试中会发生什么:
我的工作团队面临着同样的问题:在(更大的)Fortran程序中从静态切换到动态分配时,CPU时间显着增加。
非常感谢您的帮助。答案 0 :(得分:1)
最重要的部分应该是知识,编译器可以使用。
静态版本具有固定的数组大小,编译器可以使用它来强制执行更好的优化。例如。矩阵行之间的距离是固定的(cc3(n,1)
在Fortran内存中的cc3(1,2)
旁边),而动态数组可能有一些填充(可能有一个元素cc3(n+1,1)
。实际上,看看-fopt-info-optimized
的输出,我们可以看到,l.95的循环只在静态情况下得到优化。
为了检查这一点,我修改了程序以使用线性数组来表示矩阵。对程序进行定时,我在静态和动态分配之间没有显着的时间差异,并且使用相同速度执行具有最佳循环顺序的2d版本。
program Test
#ifdef ALLOCATION_DYNAMIC
integer :: n = N
double precision, dimension(:), allocatable :: cc1
double precision, dimension(:), allocatable :: cc2
double precision, dimension(:), allocatable :: cc3
#else
integer, parameter :: n = N
double precision, dimension(n**2) :: cc1
double precision, dimension(n**2) :: cc2
double precision, dimension(n**2) :: cc3
#endif
character(len = 5) :: optflag = OPTFLAG
character(len = 8) :: time = SEED
#ifdef ALLOCATION_DYNAMIC
character(len = 10) :: arg
#endif
double precision :: sum
double precision :: cpu_start, cpu_end, wall_start, wall_end
integer :: clock_reading, clock_rate, clock_max
integer :: i, j, k, s
double precision, dimension(2) :: harvest
integer, dimension(:), allocatable :: seed
write(6, FMT = '(A,A)', ADVANCE = 'NO') "optflag ", optflag
write(6, FMT = '(A)', ADVANCE = 'NO') " allocation "
#ifdef ALLOCATION_DYNAMIC
write(6, FMT = '(A)', ADVANCE = 'NO') "dynamic"
if (iargc().gt.0) then
call getarg(1, arg)
read(arg, '(I8)') n
end if
#else
write(6, FMT = '(A)', ADVANCE = 'NO') "static"
#endif
write(6, FMT = '(A,I8)', ADVANCE = 'NO') " N ", n
#ifdef ALLOCATION_DYNAMIC
allocate(cc1(n**2))
allocate(cc2(n**2))
allocate(cc3(n**2))
#endif
! Init.
call random_seed(size = s)
allocate(seed(s))
seed = 0
read(time(1:2), '(I2)') seed(1)
read(time(4:5), '(I2)') seed(2)
read(time(7:8), '(I2)') seed(3)
call random_seed(put = seed)
deallocate(seed)
do i = 1, n**2
call random_number(harvest)
cc1(i) = harvest(1)
cc2(i) = harvest(2)
cc3(i) = dble(0)
enddo
write(6, FMT = '(A)', ADVANCE = 'NO') " transpose "
write(6, FMT = '(A)', ADVANCE = 'NO') "no"
call cpu_time(cpu_start)
call system_clock (clock_reading, clock_rate, clock_max)
wall_start = dble(clock_reading) / dble(clock_rate)
! Work.
do j = 1, n
do i = 1, n
do k = 1, n
cc3((j-1)*n+i) = cc3((j-1)*n+i) + cc1((i-1)*n+k) * cc2((j-1)*n+k)
enddo
enddo
enddo
sum = dble(0)
do j = 1, n**2
sum = sum + cc3(i)
enddo
sum = sum / (n * n)
call cpu_time(cpu_end)
call system_clock (clock_reading, clock_rate, clock_max)
wall_end = dble(clock_reading) / dble(clock_rate)
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " cpu ", cpu_end - cpu_start
write(6, FMT = '(A,F23.16)', ADVANCE = 'NO') " wall ", wall_end - wall_start
write(6, FMT = '(A,F23.16)') " sum ", sum
#ifdef ALLOCATION_DYNAMIC
deallocate(cc1)
deallocate(cc2)
deallocate(cc3)
#endif
end program Test
答案 1 :(得分:1)
显然,非转置测试中静态和动态分配之间的CPU时间差异是由于
do j = 1, n
do i = 1, n
do k = 1, n
cc3(i, j) = cc3(i, j) + cc1(i, k) * cc2(k, j)
enddo
enddo
enddo
用于Fortran 90程序和
// Work.
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
for (int k = 0; k < n; ++k)
{
#if defined(ALLOCATION_DYNAMIC)
cc3(i, j) += cc1(i, k) * cc2(k, j);
#else
cc3[i][j] += cc1[i][k] * cc2[k][j];
#endif
}
用于C ++版本。
编译器能够进行更深入的优化(-fopt-info-optimize
)
当使用静态分配时,在这种情况下它输出(F90 / C ++):
在src / test.cpp进行矢量化循环:163
src / test.cpp:163:注意:LOOP VECTORIZED。
src / test.cpp:163:注意:外部循环矢量。
它对动态分配没有任何作用,我对此感到非常惊讶,我不明白为什么编译器可以在静态分配时优化这种非连续内存访问(cc3[k][j]
)而不是在动态分配时({{ 1}})。