I want to perform polynomial calculations in Python. The polynomial package in numpy is not fast enough for me, so I decided to rewrite several functions in Fortran and use f2py to create shared libraries that can easily be imported into Python. Currently I am benchmarking my routines for univariate and bivariate polynomial evaluation against their numpy counterparts.

In the univariate routine I use Horner's method, as does numpy.polynomial.polynomial.polyval. I have observed that the factor by which the Fortran routine is faster than the numpy counterpart increases with the order of the polynomial.

In the bivariate routine I use Horner's method twice: first in y and then in x. Unfortunately, I have observed that for increasing polynomial order the numpy counterpart catches up and eventually surpasses my Fortran routine. Since numpy.polynomial.polynomial.polyval2d uses an approach similar to mine, I find this second observation strange.

I am hoping that this result stems from my inexperience with Fortran and f2py. Does anyone have a clue why the univariate routine always appears superior, while the bivariate routine is only superior for low-order polynomials?
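For reference, here is a minimal pure-Python sketch of the evaluation scheme described above (plain Horner's method for the univariate case, and Horner in x nested inside Horner in y for the bivariate case). The function names are illustrative only; the results are checked against the numpy routines mentioned above.

import numpy as np

def horner1d(p, x):
    # Evaluate sum(p[i] * x**i) with Horner's method, like polyval.
    pval = 0.0
    for c in p[::-1]:
        pval = pval * x + c
    return pval

def horner2d(p, x, y):
    # Evaluate sum(p[i, j] * x**i * y**j): Horner in x inside, Horner in y outside.
    pval = 0.0
    for col in p.T[::-1]:          # columns of p, i.e. fixed powers of y, highest first
        pval = pval * y + horner1d(col, x)
    return pval

# Sanity check against the numpy counterparts
p1, p2 = np.random.rand(5), np.random.rand(4, 6)
x, y = 0.3, 0.7
assert np.isclose(horner1d(p1, x), np.polynomial.polynomial.polyval(x, p1))
assert np.isclose(horner2d(p2, x, y), np.polynomial.polynomial.polyval2d(x, y, p2))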
EDIT Here is my latest updated code, the benchmark script, and the performance plots:
polynomial.f95
! Univariate evaluation with Horner's method
subroutine polyval(p, x, pval, nx)
    implicit none
    real(8), dimension(nx), intent(in) :: p
    real(8), intent(in) :: x
    real(8), intent(out) :: pval
    integer, intent(in) :: nx
    integer :: i
    pval = 0.0d0
    do i = nx, 1, -1
        pval = pval*x + p(i)
    end do
end subroutine polyval

! Bivariate evaluation: Horner in x (inner loop), then Horner in y (outer loop)
subroutine polyval2(p, x, y, pval, nx, ny)
    implicit none
    real(8), dimension(nx, ny), intent(in) :: p
    real(8), intent(in) :: x, y
    real(8), intent(out) :: pval
    integer, intent(in) :: nx, ny
    real(8) :: tmp
    integer :: i, j
    pval = 0.0d0
    do j = ny, 1, -1
        tmp = 0.0d0
        do i = nx, 1, -1
            tmp = tmp*x + p(i, j)
        end do
        pval = pval*y + tmp
    end do
end subroutine polyval2

! Trivariate evaluation: Horner in x, y and z
subroutine polyval3(p, x, y, z, pval, nx, ny, nz)
    implicit none
    real(8), dimension(nx, ny, nz), intent(in) :: p
    real(8), intent(in) :: x, y, z
    real(8), intent(out) :: pval
    integer, intent(in) :: nx, ny, nz
    real(8) :: tmp, tmp2
    integer :: i, j, k
    pval = 0.0d0
    do k = nz, 1, -1
        tmp2 = 0.0d0
        do j = ny, 1, -1
            tmp = 0.0d0
            do i = nx, 1, -1
                tmp = tmp*x + p(i, j, k)
            end do
            tmp2 = tmp2*y + tmp
        end do
        pval = pval*z + tmp2
    end do
end subroutine polyval3
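For completeness, this is roughly how the compiled module is used from Python once it has been built (the build step is in benchmark.py below). f2py turns the trailing size arguments (nx, ny, nz) into optional arguments inferred from the array shapes, which is why they are not passed explicitly; the module name polynomial matches the -m flag used in the benchmark script.

import numpy as np
import polynomial   # the f2py-built extension module (assumes it has been compiled)

p1 = np.random.rand(10)
p2 = np.asfortranarray(np.random.rand(5, 5))   # Fortran order avoids an implicit copy
x, y = 0.3, 0.7

print polynomial.polyval(p1, x)       # nx is taken from p1.shape
print polynomial.polyval2(p2, x, y)   # nx, ny are taken from p2.shape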
benchmark.py (this script was used to generate the plots)
import time
import os
import numpy as np
import matplotlib.pyplot as plt
# Compile and import Fortran module
os.system('f2py -c polynomial.f95 --opt="-O3 -ffast-math" \
--f90exec="gfortran-4.8" -m polynomial')
import polynomial
# Create random x, y and z values
x = np.random.rand()
y = np.random.rand()
z = np.random.rand()
# Number of repetitions
repetition = 10
# Number of times to loop over a function
run = 100
# Number of data points
points = 26
# Min and max number of coefficients for the univariate case
n_uni_min = 4
n_uni_max = 100
# Min and max number of coefficients for the bivariate case
n_bi_min = 4
n_bi_max = 100
# Min and max number of coefficients for the trivariate case
n_tri_min = 4
n_tri_max = 100
# Case on/off switch
case_on = [1, 1, 1]
case_1_done = 0
case_2_done = 0
case_3_done = 0
#=================#
# UNIVARIATE CASE #
#=================#
if case_on[0]:

    # Array containing the polynomial order + 1 for several univariate polynomials
    # (loop variable renamed so it does not clobber the evaluation point x)
    n_uni = np.array([int(n) for n in np.linspace(n_uni_min, n_uni_max, points)])

    # Initialise arrays for storing timing results
    time_uni_numpy = np.zeros(n_uni.size)
    time_uni_fortran = np.zeros(n_uni.size)

    for i in xrange(len(n_uni)):
        # Create random univariate polynomial of order n - 1
        p = np.random.rand(n_uni[i])

        # Time evaluation of polynomial using NumPy
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): np.polynomial.polynomial.polyval(x, p)
            t2 = time.time()
            dt.append(t2 - t1)
        time_uni_numpy[i] = np.average(dt[2::])

        # Time evaluation of polynomial using Fortran
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): polynomial.polyval(p, x)
            t2 = time.time()
            dt.append(t2 - t1)
        time_uni_fortran[i] = np.average(dt[2::])

    # Speed-up factor
    factor_uni = time_uni_numpy / time_uni_fortran

    results_uni = np.zeros([len(n_uni), 4])
    results_uni[:, 0] = n_uni
    results_uni[:, 1] = factor_uni
    results_uni[:, 2] = time_uni_numpy
    results_uni[:, 3] = time_uni_fortran
    print results_uni, '\n'

    plt.figure()
    plt.plot(n_uni, factor_uni)
    plt.title('Univariate comparison')
    plt.xlabel('# coefficients')
    plt.ylabel('Speed-up factor')
    plt.xlim(n_uni[0], n_uni[-1])
    plt.ylim(0, max(factor_uni))
    plt.grid(aa=True)

    case_1_done = 1
#================#
# BIVARIATE CASE #
#================#
if case_on[1]:

    # Array containing the polynomial order + 1 for several bivariate polynomials
    n_bi = np.array([int(n) for n in np.linspace(n_bi_min, n_bi_max, points)])

    # Initialise arrays for storing timing results
    time_bi_numpy = np.zeros(n_bi.size)
    time_bi_fortran = np.zeros(n_bi.size)

    for i in xrange(len(n_bi)):
        # Create random bivariate polynomial of order n - 1 in x and in y
        p = np.random.rand(n_bi[i], n_bi[i])

        # Time evaluation of polynomial using NumPy
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): np.polynomial.polynomial.polyval2d(x, y, p)
            t2 = time.time()
            dt.append(t2 - t1)
        time_bi_numpy[i] = np.average(dt[2::])

        # Time evaluation of polynomial using Fortran
        p = np.asfortranarray(p)
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): polynomial.polyval2(p, x, y)
            t2 = time.time()
            dt.append(t2 - t1)
        time_bi_fortran[i] = np.average(dt[2::])

    # Speed-up factor
    factor_bi = time_bi_numpy / time_bi_fortran

    results_bi = np.zeros([len(n_bi), 4])
    results_bi[:, 0] = n_bi
    results_bi[:, 1] = factor_bi
    results_bi[:, 2] = time_bi_numpy
    results_bi[:, 3] = time_bi_fortran
    print results_bi, '\n'

    plt.figure()
    plt.plot(n_bi, factor_bi)
    plt.title('Bivariate comparison')
    plt.xlabel('# coefficients')
    plt.ylabel('Speed-up factor')
    plt.xlim(n_bi[0], n_bi[-1])
    plt.ylim(0, max(factor_bi))
    plt.grid(aa=True)

    case_2_done = 1
#=================#
# TRIVARIATE CASE #
#=================#
if case_on[2]:

    # Array containing the polynomial order + 1 for several trivariate polynomials
    n_tri = np.array([int(n) for n in np.linspace(n_tri_min, n_tri_max, points)])

    # Initialise arrays for storing timing results
    time_tri_numpy = np.zeros(n_tri.size)
    time_tri_fortran = np.zeros(n_tri.size)

    for i in xrange(len(n_tri)):
        # Create random trivariate polynomial of order n - 1 in x, y and z
        p = np.random.rand(n_tri[i], n_tri[i], n_tri[i])

        # Time evaluation of polynomial using NumPy
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): np.polynomial.polynomial.polyval3d(x, y, z, p)
            t2 = time.time()
            dt.append(t2 - t1)
        time_tri_numpy[i] = np.average(dt[2::])

        # Time evaluation of polynomial using Fortran
        p = np.asfortranarray(p)
        dt = []
        for j in xrange(repetition):
            t1 = time.time()
            for r in xrange(run): polynomial.polyval3(p, x, y, z)
            t2 = time.time()
            dt.append(t2 - t1)
        time_tri_fortran[i] = np.average(dt[2::])

    # Speed-up factor
    factor_tri = time_tri_numpy / time_tri_fortran

    results_tri = np.zeros([len(n_tri), 4])
    results_tri[:, 0] = n_tri
    results_tri[:, 1] = factor_tri
    results_tri[:, 2] = time_tri_numpy
    results_tri[:, 3] = time_tri_fortran
    print results_tri

    plt.figure()
    plt.plot(n_tri, factor_tri)
    plt.title('Trivariate comparison')
    plt.xlabel('# coefficients')
    plt.ylabel('Speed-up factor')
    plt.xlim(n_tri[0], n_tri[-1])
    plt.ylim(0, max(factor_tri))
    plt.grid(aa=True)

    print '\n'
    case_3_done = 1
#==============================================================================
plt.show()
Results (performance plots)
EDIT Correction following steabert's proposal
subroutine polyval(p, x, pval, nx)
    implicit none
    real*8, dimension(nx), intent(in) :: p
    real*8, intent(in) :: x
    real*8, intent(out) :: pval
    integer, intent(in) :: nx
    integer, parameter :: simd = 8
    real*8 :: tmp(simd), xpower(simd), maxpower
    integer :: i, j, k
    xpower(1) = x
    do i = 2, simd
        xpower(i) = xpower(i-1)*x
    end do
    maxpower = xpower(simd)
    tmp = 0.0d0
    do i = nx+1, simd+2, -simd
        do j = 1, simd
            tmp(j) = tmp(j)*maxpower + p(i-j)*xpower(simd-j+1)
        end do
    end do
    k = mod(nx-1, simd)
    if (k == 0) then
        pval = sum(tmp) + p(1)
    else
        pval = sum(tmp) + p(k+1)
        do i = k, 1, -1
            pval = pval*x + p(i)
        end do
    end if
end subroutine polyval
EDIT Test code to verify that the code above gives poor results for x > 1
import polynomial as P
import numpy.polynomial.polynomial as PP
import numpy as np

for n in xrange(2, 100):
    poly1n = np.random.rand(n)
    poly1f = np.asfortranarray(poly1n)
    x = 2
    print np.linalg.norm(P.polyval(poly1f, x) - PP.polyval(x, poly1n)), '\n'
Answer 0 (score: 6)
In the bivariate case, p is a two-dimensional array. This means that C vs Fortran ordering of the array matters: by default, numpy produces C-ordered arrays, while the Fortran routine obviously expects Fortran ordering.

f2py is smart enough to handle this and automatically converts between C- and Fortran-ordered arrays. However, the conversion introduces some overhead, which is one possible reason for the reduced performance. You can check whether this is the cause by manually converting p to a Fortran-ordered array with numpy.asfortranarray outside of the timed section. Of course, for this to be meaningful in your real use case, you would need to make sure your input arrays are in Fortran order to begin with.
f2py has an option, -DF2PY_REPORT_ON_ARRAY_COPY, that warns you whenever an array gets copied.
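A quick sketch of both checks, reusing the build command from benchmark.py (the macro value is an assumption on my part; consult the f2py documentation for your version):

import os
import numpy as np

# Convert the coefficient array once, outside the timed loop, and confirm its order.
p = np.random.rand(100, 100)        # C-ordered by default
p = np.asfortranarray(p)            # one-time copy into Fortran order
print p.flags['F_CONTIGUOUS']       # True

# Rebuild the module so that f2py reports every implicit array copy it makes.
os.system('f2py -c polynomial.f95 -DF2PY_REPORT_ON_ARRAY_COPY=1 -m polynomial')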
If this is not the cause, then you need to look at deeper details, such as which Fortran compiler you are using and which optimizations it applies. Examples of things that could slow you down include allocating arrays on the heap instead of the stack (with expensive calls to malloc), although I would expect such effects to become less important for larger arrays.
Finally, you should consider the possibility that, for large N, the numpy bivariate routine is already essentially at optimum efficiency. In that case the numpy routine spends most of its time running optimized C code, and the overhead of the Python layer becomes negligible in comparison; you should then not expect your Fortran code to show any significant speed-up.
Answer 1 (score: 3)
My guess is that your tmp array becomes so large that it requires L2, L3, or even main-memory accesses instead of staying in cache. It might be better to break these loops up and work on chunks of them right away (strip mining).
Answer 2 (score: 1)
Your functions are short, so you may get better results by inlining polyval. You can also avoid computing indices by simply reversing the loops:
subroutine polyval2(p, x, y, pval, nx, ny)
    implicit none
    real(8), dimension(nx, ny), intent(in), target :: p
    real(8), intent(in) :: x, y
    real(8), intent(out) :: pval
    integer, intent(in) :: nx, ny
    real(8) :: tmp
    integer :: i, ii
    pval = 0.d0
    do i = ny, 1, -1
        tmp = 0.d0
        do ii = nx, 1, -1
            tmp = tmp*x + p(ii, i)
        end do
        pval = pval*y + tmp
    end do
end subroutine polyval2
With this code I get roughly 10% shorter execution times for large arrays compared with the original code you posted. (I tested a pure-Fortran program with Nx = Ny = 1000, compiled with gfortran -O3 -funroll-loops.)
I agree with haraldkl that the sharp drop in performance once the sizes become too large is quite typical of cache/memory access patterns. Strip mining can help, but I would discourage doing it by hand; use compiler flags instead: -floop-strip-mine for gfortran and -O3 for ifort. Also, try loop unrolling: -funroll-loops for gfortran and ifort. You can specify these flags using f2py -c --f90flags="...".
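Passed through the build step used in benchmark.py, that could look like the following (an assumed variant of the benchmark's build line; the flags shown are the gfortran ones named above, and ifort would use a different set):

import os

# Pass the suggested gfortran flags to the Fortran compiler via f2py.
os.system('f2py -c polynomial.f95 '
          '--f90flags="-O3 -funroll-loops -floop-strip-mine" '
          '-m polynomial')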
Answer 3 (score: 1)
Following the other suggestions, I tested this, and using p = np.asfortranarray(p) before the timer indeed brings the performance on par with numpy. I extended the range of the bivariate benchmark to n_bi = np.array([2**i for i in xrange(1, 15)]) so that the p matrix becomes larger than my L3 cache.
To optimize this further, I don't think automatic compiler options will help much, because the inner loop carries a dependency. Only when you unroll it manually does ifort vectorize the inner loop; gfortran additionally needs -O3 and -ffast-math. For matrix sizes limited by main-memory bandwidth, this raises the performance advantage over numpy from a factor of about 1 to about 3.
Update: After applying this to the univariate code as well and compiling with f2py --opt='-O3 -ffast-math' -c -m polynomial polynomial.f90, I get the following source and results for benchmark.py:
subroutine polyval(p, x, pval, nx)
    implicit none
    real*8, dimension(nx), intent(in) :: p
    real*8, intent(in) :: x
    real*8, intent(out) :: pval
    integer, intent(in) :: nx
    integer, parameter :: simd = 8
    real*8 :: tmp(simd), vecx(simd), xfactor
    integer :: i, j, k
    ! precompute factors
    do i = 1, simd
        vecx(i) = x**(i-1)
    end do
    xfactor = x**simd
    tmp = 0.0d0
    do i = 1, nx, simd
        do k = 1, simd
            tmp(k) = tmp(k)*xfactor + p(nx-(i+k-1)+1)*vecx(simd-k+1)
        end do
    end do
    pval = sum(tmp)
end subroutine polyval

subroutine polyval2(p, x, y, pval, nx, ny)
    implicit none
    real*8, dimension(nx, ny), intent(in) :: p
    real*8, intent(in) :: x, y
    real*8, intent(out) :: pval
    integer, intent(in) :: nx, ny
    integer, parameter :: simd = 8
    real*8 :: tmp(simd), vecx(simd), xfactor
    integer :: i, j, k
    ! precompute factors
    do i = 1, simd
        vecx(i) = x**(i-1)
    end do
    xfactor = x**simd
    ! horner
    pval = 0.0d0
    do i = 1, ny
        tmp = 0.0d0
        do j = 1, nx, simd
            ! inner vectorizable loop
            do k = 1, simd
                tmp(k) = tmp(k)*xfactor + p(nx-(j+k-1)+1, ny-i+1)*vecx(simd-k+1)
            end do
        end do
        pval = pval*y + sum(tmp)
    end do
end subroutine polyval2
Update 2: As has been pointed out, this code is not correct, at least when simd does not evenly divide the size. It merely demonstrates the concept of manually helping the compiler, so don't use it as-is. If the sizes are not a power of 2, a small remainder loop has to deal with the dangling indices. It is not difficult to do; here is the correct procedure for the univariate case, which should be straightforward to extend to the bivariate case:
subroutine polyval(p, x, pval, nx)
    implicit none
    real*8, dimension(nx), intent(in) :: p
    real*8, intent(in) :: x
    real*8, intent(out) :: pval
    integer, intent(in) :: nx
    integer, parameter :: simd = 4
    real*8 :: tmp(simd), vecx(simd), xfactor
    integer :: i, j, k, nr
    ! precompute factors
    do i = 1, simd
        vecx(i) = x**(i-1)
    end do
    xfactor = x**simd
    ! check remainder
    nr = mod(nx, simd)
    ! horner
    tmp = 0.0d0
    do i = 1, nx-nr, simd
        do k = 1, simd
            tmp(k) = tmp(k)*xfactor + p(nx-(i+k-1)+1)*vecx(simd-k+1)
        end do
    end do
    pval = sum(tmp)
    ! do remainder
    pval = pval * x**nr
    do i = 1, nr
        pval = pval + p(i) * vecx(i)
    end do
end subroutine polyval
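For illustration, the same blocked-Horner idea written out in pure Python (the function name and the simd parameter are illustrative only, not part of the original code); it can be checked directly against numpy:

import numpy as np

def polyval_blocked(p, x, simd=4):
    # Evaluate sum(p[i] * x**i) with `simd` interleaved Horner recurrences,
    # plus a remainder loop for the lowest nx % simd coefficients.
    nx = len(p)
    vecx = x ** np.arange(simd)          # [1, x, ..., x**(simd-1)]
    xfactor = x ** simd
    nr = nx % simd

    tmp = np.zeros(simd)
    for i in range(0, nx - nr, simd):
        block = p[nx - i - simd : nx - i]              # `simd` consecutive coefficients
        tmp = tmp * xfactor + block[::-1] * vecx[::-1]
    pval = tmp.sum()

    # Fold in the remaining low-order coefficients.
    pval *= x ** nr
    pval += np.dot(p[:nr], vecx[:nr])
    return pval

p = np.random.rand(37)                   # size deliberately not a multiple of simd
x = 0.9
assert np.isclose(polyval_blocked(p, x),
                  np.polynomial.polynomial.polyval(x, p))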
Also, one should be careful with very small sizes, since the times become too short to give accurate performance figures. Moreover, timings relative to numpy can be deceptive, because the absolute timings of numpy may be very poor. Here are the timings for the largest cases: for the univariate case with nx = 2^20, numpy takes 1.21 s versus 1.69e-3 s for the custom Fortran version; for the bivariate case with nx*ny = 2^20, numpy takes 8e-3 s versus 1.68e-3 s for the custom version. The fact that the univariate and bivariate timings are the same when the total size nx*ny is the same is significant, since it supports the conclusion that the code is executing close to the memory-bandwidth limit.
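As a rough plausibility check of that claim (simple arithmetic based on the quoted numbers, added here for illustration): streaming 2^20 double-precision coefficients once in about 1.7e-3 s corresponds to roughly 5 GB/s, a plausible single-thread streaming rate.

n = 2**20                  # number of double-precision coefficients
seconds = 1.68e-3          # quoted time of the custom Fortran routine
print n * 8 / seconds / 1e9, 'GB/s'   # ~5 GB/s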
Update 3: With the new python script, which also covers smaller sizes, and simd=4, I get the following performance:
Update 4: As for correctness, the results are identical to within double precision, as can be seen by running this python code for the univariate example:
import polynomial as P
import numpy.polynomial.polynomial as PP
import numpy as np

for n in xrange(2, 100):
    poly1n = np.random.rand(n)
    poly1f = np.asfortranarray(poly1n)
    x = 2
    print "%18.14e" % P.polyval(poly1f, x)
    print "%18.14e" % PP.polyval(x, poly1n)
    print (P.polyval(poly1f, x) - PP.polyval(x, poly1n))/PP.polyval(x, poly1n), '\n'