我用cython重写了我的python循环,希望速度能有很大提升,但实际只快了大约四分之一。是我哪里做错了吗?下面是没有使用cython的代码:
import numpy as np
import itertools as itr
import math
def Pk(b, f, mu, k):  # k is in Mpc
    """Return the anisotropic power at wavenumber ``k`` and direction ``mu``.

    The isotropic spectrum is a Gaussian bump of amplitude 200, centred on
    k = 0.02 with width 0.01.  It is multiplied by ``(b + mu**2 * f)**2``.

    Args:
        b: Constant term of the ``(b + mu**2 * f)`` prefactor.
        f: Coefficient of ``mu**2`` in the prefactor.
        mu: Direction variable; the caller passes ``kz / |k|``.
        k: Magnitude of the wave vector.

    Returns:
        The power as a float (0.0 when the exponential underflows).
    """
    iso_pk = 200 * math.exp(-(k - 0.02) ** 2 / 2 / 0.01 ** 2)  # isotropic part
    return (b + mu ** 2 * f) ** 2 * iso_pk
def Gendk(N, kvec, Pk, b, f, deltak3d):
    """Fill ``deltak3d`` in place with random modes and their conjugates.

    Every index triple ``(xx, yy, zz)`` with ``zz <= N // 2`` is visited.  For
    each one a complex value is drawn and stored at that cell, and its complex
    conjugate is stored at the index-mirrored cell ``(-xx, -yy, -zz) mod N``.
    The cell whose wave vector has zero length is skipped and left untouched.

    Args:
        N: Number of grid points per axis.
        kvec: Sequence of length ``N`` giving the wavenumber for each index.
        Pk: Callable invoked as ``Pk(b, f, mu, kk)`` that returns the power.
        b: Forwarded unchanged to ``Pk``.
        f: Forwarded unchanged to ``Pk``.
        deltak3d: Complex array indexed ``[x, y, z]``; modified in place.

    Returns:
        None.
    """
    Nhalf = N // 2
    for xx, yy, zz in itr.product(range(N), range(N), range(Nhalf + 1)):
        kx = kvec[xx]
        ky = kvec[yy]
        kz = kvec[zz]
        kk = math.sqrt(kx ** 2 + ky ** 2 + kz ** 2)
        if kk == 0:
            # mu = kz / kk is undefined for the zero mode; leave it as it is.
            continue
        mu = kz / kk
        power = Pk(b, f, mu, kk)
        if power == 0:
            deltaRe = 0
            deltaIm = 0
        else:
            # NOTE(review): np.random.normal takes a standard deviation as its
            # second argument; power / 2.0 is passed as-is.  Confirm this is
            # intended rather than sqrt(power / 2.0).
            deltaRe = np.random.normal(0, power / 2.0)
            if (
                (xx == 0 or xx == Nhalf)
                and (yy == 0 or yy == Nhalf)
                and (zz == 0 or zz == Nhalf)
            ):
                # This cell is its own mirror image, so it must stay real.
                deltaIm = 0
            else:
                deltaIm = np.random.normal(0, power / 2.0)
        x_conj = (N - xx) % N
        y_conj = (N - yy) % N
        z_conj = (N - zz) % N
        deltak3d[xx, yy, zz] = deltaRe + deltaIm * 1j
        deltak3d[x_conj, y_conj, z_conj] = deltaRe - deltaIm * 1j
# Driver for the pure-Python version: set up the grid, then fill the cube.
Ntot = 300000
L = 1000
N = 128
kmax = 5.0

Nhalf = N // 2
dk = kmax / N
dL = L / N

# Wavenumber for each grid index, using a sample spacing of L / N.
kvec = np.fft.fftfreq(N, L / N)

# Complex N x N x N cube; the zero mode holds Ntot and Gendk leaves it alone.
deltak3d = np.zeros((N, N, N), dtype=complex)
deltak3d[0, 0, 0] = Ntot

Gendk(N, kvec, Pk, 2, 1, deltak3d)
这是cython的版本:
import numpy as np
import pyximport; pyximport.install(setup_args={"include_dirs":np.get_include()})
import testGauss as tG
# Driver for the Cython version: same setup, but Gendk and Pk come from the
# compiled testGauss module (imported as tG).
Ntot = 300000
L = 1000
N = 128
kmax = 5.0

Nhalf = N // 2
dk = kmax / N
dL = L / N

# Wavenumber for each grid index, using a sample spacing of L / N.
kvec = np.fft.fftfreq(N, L / N)

# Complex N x N x N cube; the zero mode holds Ntot.
deltak3d = np.zeros((N, N, N), dtype=complex)
deltak3d[0, 0, 0] = Ntot

tG.Gendk(N, kvec, tG.Pk, 2, 1, deltak3d)
并且testGauss.pyx文件是:
import math
import numpy as np
cimport numpy as np
import itertools as itr
def Pk(double b, double f, double mu, double k):  # k is in Mpc
    """Return the anisotropic power at wavenumber ``k`` and direction ``mu``.

    The isotropic spectrum is a Gaussian bump of amplitude 200, centred on
    k = 0.02 with width 0.01.  It is multiplied by ``(b + mu**2 * f)**2``.

    All four arguments are coerced to C doubles on entry.  The result is
    returned as a Python float.
    """
    cdef double isoPk, power
    # math.exp is still a Python-level call on every invocation.
    isoPk = 200 * math.exp(-(k - 0.02) ** 2 / 2 / 0.01 ** 2)  # isotropic part
    power = (b + mu ** 2 * f) ** 2 * isoPk
    return power
def Gendk(int N, np.ndarray[np.float64_t, ndim=1] kvec, Pk, double b, double f,
          np.ndarray[np.complex128_t, ndim=3] deltak3d):
    """Fill ``deltak3d`` in place with random modes and their conjugates.

    Every index triple ``(xx, yy, zz)`` with ``zz <= N // 2`` is visited.  For
    each one a complex value is drawn and stored at that cell, and its complex
    conjugate is stored at the index-mirrored cell ``(-xx, -yy, -zz) mod N``.
    The cell whose wave vector has zero length is skipped and left untouched.

    ``Pk`` is a Python callable invoked as ``Pk(b, f, mu, kk)``.  That call,
    ``math.sqrt`` and ``np.random.normal`` remain Python-level calls inside
    the loop.
    """
    cdef int Nhalf = N // 2
    cdef int xx, yy, zz
    cdef int x_conj, y_conj, z_conj
    cdef double kx, ky, kz, kk
    # A bare ``cdef name`` declares a Python object, so these need an explicit
    # C type for the arithmetic below to compile to plain C.
    cdef double mu
    cdef double power
    cdef double deltaRe, deltaIm

    # Nested range() loops over typed ints compile to C loops.  The order
    # matches itertools.product: xx is outermost and zz is innermost.
    for xx in range(N):
        for yy in range(N):
            for zz in range(Nhalf + 1):
                kx = kvec[xx]
                ky = kvec[yy]
                kz = kvec[zz]
                kk = math.sqrt(kx ** 2 + ky ** 2 + kz ** 2)
                if kk == 0:
                    # mu = kz / kk is undefined for the zero mode.
                    continue
                mu = kz / kk
                power = Pk(b, f, mu, kk)
                if power == 0:
                    deltaRe = 0
                    deltaIm = 0
                else:
                    # NOTE(review): np.random.normal takes a standard
                    # deviation as its second argument; power / 2.0 is passed
                    # as-is.  Confirm this is intended.
                    deltaRe = np.random.normal(0, power / 2.0)
                    if (xx == 0 or xx == Nhalf) and (yy == 0 or yy == Nhalf) and (zz == 0 or zz == Nhalf):
                        # This cell is its own mirror image; keep it real.
                        deltaIm = 0
                    else:
                        deltaIm = np.random.normal(0, power / 2.0)
                x_conj = (2 * N - xx) % N
                y_conj = (2 * N - yy) % N
                z_conj = (2 * N - zz) % N
                deltak3d[xx, yy, zz] = deltaRe + deltaIm * 1j
                deltak3d[x_conj, y_conj, z_conj] = deltaRe - deltaIm * 1j
非常感谢你!
答案 0 :(得分:2)
使用cProfile来分析您的Python代码。也许CPU占用最高的那部分工作已经是在NumPy内部完成的,那样的话,Cython能带来的提升就不会太多。
答案 1 :(得分:2)
你可以通过把
import math
替换为
from libc cimport math
来获得一些加速。
这样在调用sqrt和exp时就不会经过python函数调用,而是直接调用对应的C函数(这应该快得多)。
我还有点担心你在循环中对np.random.normal的调用,每次调用都会带来相当可观的python开销。更快的做法可能是在循环之前调用一次,一次性生成大量随机数(只有单次python调用的开销),然后在循环内部把不需要的位置用0覆盖。
优化Cython的一般建议仍然适用:运行
cython -a your_file.pyx
然后查看生成的HTML,重点关注以黄色高亮的部分(但仅限于被频繁调用的代码)。
答案 2 :(得分:2)
使用Pythran把(稍加修改的)代码转换成原生模块,可以获得约50倍的加速。
import numpy as np
import itertools as itr
import math
from random import gauss as normal
def Pk(b, f, mu, k):  # k is in Mpc
    """Return the anisotropic power at wavenumber ``k`` and direction ``mu``.

    The isotropic spectrum is a Gaussian bump of amplitude 200, centred on
    k = 0.02 with width 0.01.  It is multiplied by ``(b + mu**2 * f)**2``.

    Args:
        b: Constant term of the ``(b + mu**2 * f)`` prefactor.
        f: Coefficient of ``mu**2`` in the prefactor.
        mu: Direction variable; the caller passes ``kz / |k|``.
        k: Magnitude of the wave vector.

    Returns:
        The power as a float.
    """
    isoPk = 200 * math.exp(-(k - 0.02) ** 2 / 2 / 0.01 ** 2)  # isotropic part
    power = (b + mu ** 2 * f) ** 2 * isoPk
    return power
#pythran export Gendk(int, float[], int, int, complex[][][])
def Gendk(N, kvec, b, f, deltak3d):
    """Fill ``deltak3d`` in place with random modes and their conjugates.

    Every index triple ``(xx, yy, zz)`` with ``zz <= N/2`` is visited.  For
    each one a complex value is drawn and stored at that cell, and its complex
    conjugate is stored at the index-mirrored cell ``(-xx, -yy, -zz) mod N``.
    The cell whose wave vector has zero length is skipped and left untouched.

    The power comes from the module-level ``Pk(b, f, mu, kk)``.
    """
    Nhalf = int(N/2)
    for xx, yy, zz in itr.product(range(0, N), range(0, N), range(0, Nhalf+1)):
        kx = kvec[xx]
        ky = kvec[yy]
        kz = kvec[zz]
        kk = math.sqrt(kx**2+ky**2+kz**2)
        if kk == 0:
            # mu = kz / kk is undefined for the zero mode; leave it as it is.
            continue
        mu = kz/kk
        power = Pk(b, f, mu, kk)
        if power == 0:
            deltaRe = 0
            deltaIm = 0
        else:
            # ``normal`` is random.gauss; it replaces the np.random.normal
            # call of the original version (presumably so that the module
            # compiles with Pythran).
            deltaRe = normal(0, power/2.0)
            if (xx == 0 or xx == Nhalf) and (yy == 0 or yy == Nhalf) and (zz == 0 or zz == Nhalf):
                # This cell is its own mirror image, so it must stay real.
                deltaIm = 0
            else:
                deltaIm = normal(0, power/2.0)
        x_conj = (2*N-xx)%N
        y_conj = (2*N-yy)%N
        z_conj = (2*N-zz)%N
        deltak3d[xx, yy, zz] = deltaRe + deltaIm*1j
        deltak3d[x_conj, y_conj, z_conj] = deltaRe - deltaIm*1j
编译:
$ pythran tg.py
并测试:
$ python -m timeit -s 'import numpy as np; Ntot = 30000; L = 1000; N = 12; Nhalf = int(N/2); kmax = 5.0; dk = kmax/N; kvec = np.fft.fftfreq(N, L/N); dL = L/N; deltak3d = np.zeros((N, N, N), dtype=complex); deltak3d[0, 0, 0] = Ntot; from tg import Gendk' 'Gendk(N, kvec, 2, 1, deltak3d)'
CPython运行的结果是:10 loops, best of 3: 29.4 msec per loop;
Pythran运行的结果是:1000 loops, best of 3: 587 usec per loop。
免责声明:我是Pythran的开发者。