仅适用于无法放入栈中的大型数组。下面是分别使用 gfortran 和 Intel Fortran 编译器得到的代码和计时结果,平台为 Windows 10,编译器标志为 -Ofast。
program matrix_multiply
  ! Benchmark: times a hand-coded n-by-n matrix product c = a*b that reads a
  ! transposed copy of a, using statically sized ("plain") arrays, then prints
  ! the elapsed time together with one element of the result.
  implicit none
  integer, parameter :: dp = kind(1.0d0)       ! double-precision kind
  integer, parameter :: n = 1500
  integer, parameter :: mid = n/2 + 1          ! index of the element that is printed
  integer :: row, col, k
  integer :: tick_start, tick_end, ticks_per_sec, tick_max
  real(dp) :: acc
  real :: elapsed
  real(dp) :: a(n,n), b(n,n), c(n,n), aT(n,n)  ! plain arrays
  ! Allocatable variant (use instead of the declaration above to compare):
  ! real(dp), allocatable :: a(:,:), b(:,:), c(:,:), aT(:,:) ! allocatable arrays
  ! allocate ( a(n,n), b(n,n), c(n,n), aT(n,n) )

  ! Fill a; every element depends only on its own indices, so the loops can
  ! run with the first index innermost.
  do col = 1, n
    do row = 1, n
      a(row,col) = 1.0_dp/n/n * (row-col) * (row+col)
    end do
  end do
  ! b is defined by exactly the same formula as a.
  b = a

  ! Transposed copy of a, so the innermost product loop below walks down
  ! columns of both operands (cache-friendly).
  do row = 1, n
    aT(:,row) = a(row,:)
  end do

  ! Timed section: c(row,col) = sum over k of a(row,k) * b(k,col).
  call system_clock(tick_start, ticks_per_sec, tick_max)
  do row = 1, n
    do col = 1, n
      acc = 0
      do k = 1, n
        acc = acc + aT(k,row) * b(k,col)
      end do
      c(row,col) = acc
    end do
  end do
  call system_clock(tick_end)

  elapsed = real(tick_end - tick_start)/ticks_per_sec
  print '(4G0)', "Elapsed time: ", elapsed, ', c_(n/2+1) = ', c(mid,mid)
end program matrix_multiply
! Intel Fortran
! -------------
Elapsed time: 1.546000, c_(n/2+1) = -143.8334 ! Plain Arrays
Elapsed time: 1.417000, c_(n/2+1) = -143.8334 ! Allocatable Arrays
! gfortran:
! -------------
Elapsed time: 1.827999, c_(n/2+1) = -143.8334 ! Plain Arrays
Elapsed time: 1.702999, c_(n/2+1) = -143.8334 ! Allocatable Arrays
冒着让问题变得更长的风险,下面再给出一个示例,Intel Fortran 编译器在其中表现出相同的行为:
program testArrays
  ! Benchmark: applies the same element-wise update 1000 times to the leading
  ! sz-by-sz section of (1) a plain array, (2) an allocatable array and
  ! (3) a pointer array, and prints the elapsed time of each variant together
  ! with the sum of the top-left 3x3 corner of the updated array.
  use, intrinsic :: iso_fortran_env, only: dp => real64
  implicit none
  integer, parameter :: m = 1223, n = 2015
  integer, parameter :: sz = min(m, n)       ! edge length of the updated square section
  integer, parameter :: n_repeat = 1000      ! number of timed calls per variant
  real(dp), parameter :: pi = acos(-1.0_dp)
  real(dp) :: a(m,n)
  real(dp), allocatable :: b(:,:)
  real(dp), pointer :: c(:,:)
  integer :: i, t0, t1, count_rate

  allocate( b(m,n), c(m,n) )
  call random_seed()
  call random_number(a)
  call random_number(b)
  call random_number(c)

  ! Plain (statically sized) array.
  call system_clock(t0, count_rate)
  do i = 1, n_repeat
    call doit(a, sz)
  end do
  call system_clock(t1)
  print '(4g0)', 'Time plain: ', real(t1-t0)/count_rate, ', sum 3x3 = ', sum( a(1:3,1:3) )

  ! Allocatable array, passed to the same assumed-shape dummy.
  call system_clock(t0)
  do i = 1, n_repeat
    call doit(b, sz)
  end do
  call system_clock(t1)
  print '(4g0)', 'Time alloc: ', real(t1-t0)/count_rate, ', sum 3x3 = ', sum( b(1:3,1:3) )

  ! Pointer array, passed to a pointer dummy.
  call system_clock(t0)
  do i = 1, n_repeat
    call doitp(c, sz)
  end do
  call system_clock(t1)
  print '(4g0)', 'Time p.ptr: ', real(t1-t0)/count_rate, ', sum 3x3 = ', sum( c(1:3,1:3) )

  ! The pointer target is never released automatically, so free it explicitly.
  deallocate( b, c )

contains

  ! Update the leading sz-by-sz section of an assumed-shape array in place.
  subroutine doit(a, sz)
    real(dp), intent(inout) :: a(:,:)
    integer, intent(in) :: sz
    a(1:sz,1:sz) = sin(2*pi*a(1:sz,1:sz))/(a(1:sz,1:sz)+1)
  end subroutine doit

  ! Same update as doit, but through a pointer dummy argument.
  ! intent(in) on a pointer dummy protects the association, not the target.
  subroutine doitp(a, sz)
    real(dp), pointer, intent(in) :: a(:,:)
    integer, intent(in) :: sz
    a(1:sz,1:sz) = sin(2*pi*a(1:sz,1:sz))/(a(1:sz,1:sz)+1)
  end subroutine doitp

end program testArrays
Time plain: 2.857000, sum 3x3 = -.9913536
Time alloc: 2.750000, sum 3x3 = .4471794
Time p.ptr: 2.786000, sum 3x3 = 2.036269
Time plain: 51.5600014, sum 3x3 = 6.2749456118192093
Time alloc: 54.0300007, sum 3x3 = 6.4144775892064283
Time p.ptr: 54.1900034, sum 3x3 = -2.1546109819149963
答案 0 :(得分:1)
program matrix_multiply
  ! Benchmark: times a hand-coded n-by-n matrix product c = a*b that reads a
  ! transposed copy of a, using allocatable arrays, then prints the elapsed
  ! time together with one element of the result.
  implicit none
  integer, parameter :: dp = kind(1.0d0)       ! double-precision kind
  integer, parameter :: n = 1500
  integer, parameter :: mid = n/2 + 1          ! index of the element that is printed
  integer :: row, col, k
  integer :: tick_start, tick_end, ticks_per_sec, tick_max
  real(dp) :: acc
  real :: elapsed
  ! Plain-array variant (use instead of the allocatable declaration to compare):
  ! real(dp) :: a(n,n), b(n,n), c(n,n), aT(n,n) ! plain arrays
  real(dp), allocatable :: a(:,:), b(:,:), c(:,:), aT(:,:)  ! allocatable arrays

  allocate ( a(n,n), b(n,n), c(n,n), aT(n,n) )

  ! Fill a; every element depends only on its own indices, so the loops can
  ! run with the first index innermost.
  do col = 1, n
    do row = 1, n
      a(row,col) = 1.0_dp/n/n * (row-col) * (row+col)
    end do
  end do
  ! b is defined by exactly the same formula as a (shapes already match).
  b(:,:) = a

  ! Transposed copy of a, so the innermost product loop below walks down
  ! columns of both operands (cache-friendly).
  do row = 1, n
    aT(:,row) = a(row,:)
  end do

  ! Timed section: c(row,col) = sum over k of a(row,k) * b(k,col).
  call system_clock(tick_start, ticks_per_sec, tick_max)
  do row = 1, n
    do col = 1, n
      acc = 0
      do k = 1, n
        acc = acc + aT(k,row) * b(k,col)
      end do
      c(row,col) = acc
    end do
  end do
  call system_clock(tick_end)

  elapsed = real(tick_end - tick_start)/ticks_per_sec
  print '(4G0)', "Elapsed time: ", elapsed, ', c_(n/2+1) = ', c(mid,mid)
end program matrix_multiply
在 Windows 上使用 Intel Fortran 编译器 18.0.2 编译,并打开了优化标志:
ifort /standard-semantics /F0x1000000000 /O3 /Qip /Qipo /Qunroll /Qunroll-aggressive /inline:all /Ob2 main.f90 -o run.exe
Elapsed time: 1.580000, c_(n/2+1) = -143.8334 ! plain arrays
Elapsed time: 1.560000, c_(n/2+1) = -143.8334 ! plain arrays
Elapsed time: 1.555000, c_(n/2+1) = -143.8334 ! plain arrays
Elapsed time: 1.588000, c_(n/2+1) = -143.8334 ! plain arrays
Elapsed time: 1.551000, c_(n/2+1) = -143.8334 ! plain arrays
Elapsed time: 1.566000, c_(n/2+1) = -143.8334 ! plain arrays
Elapsed time: 1.555000, c_(n/2+1) = -143.8334 ! plain arrays
Elapsed time: 1.634000, c_(n/2+1) = -143.8334 ! allocatable arrays
Elapsed time: 1.634000, c_(n/2+1) = -143.8334 ! allocatable arrays
Elapsed time: 1.602000, c_(n/2+1) = -143.8334 ! allocatable arrays
Elapsed time: 1.623000, c_(n/2+1) = -143.8334 ! allocatable arrays
Elapsed time: 1.597000, c_(n/2+1) = -143.8334 ! allocatable arrays
Elapsed time: 1.607000, c_(n/2+1) = -143.8334 ! allocatable arrays
Elapsed time: 1.617000, c_(n/2+1) = -143.8334 ! allocatable arrays
Elapsed time: 1.606000, c_(n/2+1) = -143.8334 ! allocatable arrays
Elapsed time: 1.626000, c_(n/2+1) = -143.8334 ! allocatable arrays
Elapsed time: 1.614000, c_(n/2+1) = -143.8334 ! allocatable arrays
如您所见,平均而言,可分配数组实际上稍慢一些,这正是我预期的结果,但与您的观察结果相矛盾。我能想到的唯一差异来源是所使用的优化标志,尽管我不确定它们具体是如何造成差异的。也许您可以在不开启优化以及不同优化级别的多种模式下分别运行测试,看看在所有模式下是否都能得到一致的性能差异。有关所用优化标志含义的更多信息,请参见 Intel's reference page。
! Import only the two kind constants that are needed from the intrinsic module.
use, intrinsic :: iso_fortran_env, only: real64, int32
! n is a named constant of kind int32; a is a real64 array with n elements.
integer(int32), parameter :: n=100
real(real64) :: a(n)
! Kind names and the storage size each one denotes
! (presumably the kind constants offered by iso_fortran_env -- confirm against the compiler documentation):
int8 ! 8-bit integer
int16 ! 16-bit integer
int32 ! 32-bit integer
int64 ! 64-bit integer
real32 ! 32-bit real
real64 ! 64-bit real
real128 ! 128-bit real
program complex
  ! Minimal example: declares a complex variable whose kind comes from
  ! iso_fortran_env and writes it to standard output.
  use, intrinsic :: iso_fortran_env, only: RK => real64, output_unit
  ! the intrinsic attribute above is not essential, but recommended, so this would be also valid:
  ! use iso_fortran_env, only: RK => real64, output_unit
  implicit none
  complex(RK) :: z

  ! Assigned separately rather than initialised in the declaration.
  z = (1._RK, 2._RK)
  ! Unlimited-repeat g0 format: items are separated by a single blank.
  write(output_unit,"(*(g0,:,' '))") "Hello World! This is a complex variable:", z
end program complex
$gfortran -std=f2008 *.f95 -o main
Hello World! This is a complex variable: 1.0000000000000000 2.0000000000000000
请注意,使用 iso_fortran_env 中的这些种别(kind)常量需要符合 Fortran 2008 的编译器。
答案 1 :(得分:1)
针对“我认为可分配内存仅适用于不适合堆栈的大型阵列”这一说法:当您无法在负责某个对象整个生命周期的过程的规范部分(specification part)中确定该对象的大小或其他特征时,就必须使用可分配变量(也就是说,您实际上别无选择)。即使某些东西要到运行时才能知道,只要仍然可以在相关过程的规范部分中确定其特征,就可以使用自动变量。(不过您的示例中并没有自动变量——在不可分配、非指针的情况下,数组的所有特征在编译时都是已知的。)在 Fortran 处理器的实现层面(不同编译器和编译选项之间会有所不同),自动变量所需的栈空间可能超过可用的栈空间,这可能会引发问题,而可分配变量可以缓解这些问题(或者您也可以更改编译器选项)。