我用Python编写了以下代码。 如果我使用此代码,则需要1分20秒:
def calc_cy(searchX, searchY, searchZ, dx, dz, r_vector, p_true, a_factor, sim):
    """Fill a 3-D grid with the weighted misfit between looked-up and given values.

    For every (ix, iy, iz) taken from ``searchX``/``searchY``/``searchZ`` one
    value per entry of ``p_true`` is read from ``sim[int(iz)]``, the minimum is
    subtracted from both the looked-up values and ``p_true``, and the norm of
    the ``a_factor``-weighted difference is stored in the result grid.

    Args:
        searchX, searchY, searchZ: Iterables of grid coordinates.  The values
            themselves are cast to ``int`` and used as indices into the result
            (and ``searchZ`` into ``sim``), so they are presumably
            ``0 .. len - 1`` -- TODO confirm with the caller.
        dx: Not used by this function.
        dz: Divisor applied to ``r_vector[:, 2]`` before it is used as a
            column index.
        r_vector: 2-D array; columns 0 and 1 are combined with (ix, iy) into a
            planar distance, column 2 gives the column index after scaling.
        p_true: Array whose ``size`` sets the number of look-ups.  ``p_true[0]``
            must be iterable, so this is presumably a 2-D array with a single
            row -- TODO confirm.
        a_factor: Weight multiplied into the difference before the norm.
        sim: Indexable by ``int(iz)``; each item is indexed with
            ``[row, column]``.

    Returns:
        ``numpy.ndarray`` of shape
        ``(len(searchX), len(searchY), len(searchZ))``; cells that are never
        written stay ``nan``.
    """
    calc1 = np.full((len(searchX), len(searchY), len(searchZ)), np.nan)
    # Column index into sim[iz] for every row of r_vector.
    z_t = np.abs(np.ceil(r_vector[:, 2] / dz).squeeze())
    for ix in searchX:
        for iy in searchY:
            for iz in searchZ:
                dt = []
                # Row index: planar distance from (ix, iy), rounded down.
                ir = np.floor(
                    ((ix - r_vector[:, 0]) ** 2 + (iy - r_vector[:, 1]) ** 2) ** 0.5
                )
                for ip in range(p_true.size):
                    dt.append(sim[int(iz)][int(ir[ip]), int(z_t[ip])])
                # dt is a plain list; convert explicitly before subtracting.
                dt_sim = np.asarray(dt) - min(dt)
                dt_true = p_true - min(p_true[0])
                calc1[int(ix), int(iy), int(iz)] = np.linalg.norm(
                    a_factor * (dt_sim - dt_true)
                )
    return calc1
如果我将此代码与Cython一起使用,则需要1分5秒:
设置代码:
# Build script that compiles calc_cy.pyx into an extension module.
# NOTE(review): distutils was removed from the standard library in
# Python 3.12; setuptools exposes a compatible ``setup`` if this has to run
# on a newer interpreter.
from distutils.core import setup

import numpy
from Cython.Build import cythonize

# Translate the .pyx source into extension-module descriptions first, then
# hand them to setup() together with the NumPy header directory that the
# ``cimport numpy`` in the .pyx file needs at compile time.
extensions = cythonize("calc_cy.pyx")
setup(
    ext_modules=extensions,
    include_dirs=[numpy.get_include()],
)
使用Cython的代码:
import numpy as np
cimport numpy as np
import math
DTYPE = np.double
ctypedef np.double_t DTYPE_t
def calc_cy(np.ndarray[DTYPE_t, ndim=3] sim,
            np.ndarray[DTYPE_t, ndim=1] searchX,
            np.ndarray[DTYPE_t, ndim=1] searchY,
            np.ndarray[DTYPE_t, ndim=1] searchZ,
            np.ndarray[DTYPE_t, ndim=2] r_vector,
            np.ndarray[DTYPE_t, ndim=1] p_true,
            np.ndarray[DTYPE_t, ndim=1] a_factor):
    """Fill a 3-D grid with the weighted misfit between looked-up and given values.

    For every index triple (ix, iy, iz) one value per entry of ``p_true`` is
    read from ``sim``, the minimum is subtracted from both the looked-up
    values and ``p_true``, and the norm of the ``a_factor``-weighted
    difference is stored at ``calc1[ix, iy, iz]``.

    Args:
        sim: 3-D double array indexed as ``sim[k, ir, m]``.
        searchX, searchY: 1-D double arrays of planar coordinates.
        searchZ: 1-D double array; each value is cast to ``int`` and used as
            the first index into ``sim``.
        r_vector: 2-D double array; columns 0 and 1 are combined with the
            search coordinates into a planar distance, column 2 is used
            directly (after ceil/abs) as the last index into ``sim``.
        p_true: 1-D double array of reference values.
        a_factor: 1-D double array of weights.

    Returns:
        3-D double array of shape
        ``(len(searchX), len(searchY), len(searchZ))``.
    """
    cdef np.ndarray[DTYPE_t, ndim=3] calc1 = np.zeros((len(searchX), len(searchY), len(searchZ)), dtype=DTYPE)
    cdef np.ndarray[DTYPE_t, ndim=1] dt2 = np.zeros(len(p_true), dtype=DTYPE)
    cdef np.ndarray[DTYPE_t, ndim=1] dt_sim = np.zeros(len(dt2), dtype=DTYPE)
    # NOTE(review): dt_picks is allocated but never read in this function.
    cdef np.ndarray[DTYPE_t, ndim=1] dt_picks = np.zeros(len(p_true), dtype=DTYPE)
    cdef int ir
    cdef int k
    cdef int m
    # NOTE(review): the loop indices are not declared with cdef, and the body
    # calls np.floor, int, math.fabs/math.ceil, min and np.linalg.norm, all of
    # which go through the imported Python modules/builtins.
    for ix in range(len(searchX)):
        for iy in range(len(searchY)):
            for iz in range(len(searchZ)):
                for ip in range(len(p_true)):
                    # Row index: planar distance to r_vector[ip], rounded down.
                    ir = int(np.floor((((searchX[ix] - r_vector[ip, 0]) ** 2 + (searchY[iy] - r_vector[ip, 1]) ** 2)) ** 0.5))
                    k = int(searchZ[iz])
                    m = int(math.fabs(math.ceil(r_vector[ip, 2])))
                    dt2[int(ip)] = sim[k, ir, m]
                # Both series are made relative to their own minimum.
                dt_sim = dt2 - min(dt2)
                dt_true = p_true - min(p_true)
                calc1[ix, iy, iz] = np.linalg.norm(a_factor * (dt_sim - dt_true))
    return calc1
如何改善代码并使其更高效? 谢谢!
答案 0 :(得分:0)
看看您的代码,`ir` 的值不依赖于 `iz`,因此可以将它的计算移到 `iz` 循环之外。
`dt_true` 的值仅取决于 `p_true`,因此只需在所有循环开始之前计算一次。
这些结合起来将消除不必要的重新计算,从而加快了代码的速度。 (这称为loop-invariant code motion。)
此外,您可以将 `dt` 的计算改写为列表推导式,这应该可以加快 Python 版本的速度;不过我不确定这对 Cython 版本是否有帮助。
这会使代码看起来像这样:
def calc_cy(searchX, searchY, searchZ, dx, dz, r_vector, p_true, a_factor, sim):
    """Fill a 3-D grid with the weighted misfit between looked-up and given values.

    Work that does not change between iterations is computed as far out as
    possible: ``dt_true`` once before all loops, and the row index ``ir`` once
    per (ix, iy) pair instead of once per (ix, iy, iz) triple.

    Args:
        searchX, searchY, searchZ: Iterables of grid coordinates.  The values
            themselves are cast to ``int`` and used as indices into the result
            (and ``searchZ`` into ``sim``), so they are presumably
            ``0 .. len - 1`` -- TODO confirm with the caller.
        dx: Not used by this function.
        dz: Divisor applied to ``r_vector[:, 2]`` before it is used as a
            column index.
        r_vector: 2-D array; columns 0 and 1 are combined with (ix, iy) into a
            planar distance, column 2 gives the column index after scaling.
        p_true: Array whose ``size`` sets the number of look-ups.  ``p_true[0]``
            must be iterable, so this is presumably a 2-D array with a single
            row -- TODO confirm.
        a_factor: Weight multiplied into the difference before the norm.
        sim: Indexable by ``int(iz)``; each item is indexed with
            ``[row, column]``.

    Returns:
        ``numpy.ndarray`` of shape
        ``(len(searchX), len(searchY), len(searchZ))``; cells that are never
        written stay ``nan``.
    """
    calc1 = np.full((len(searchX), len(searchY), len(searchZ)), np.nan)
    # Column index into sim[iz] for every row of r_vector.
    z_t = np.abs(np.ceil(r_vector[:, 2] / dz).squeeze())
    # Depends only on p_true, so it is computed a single time.
    dt_true = p_true - min(p_true[0])
    for ix in searchX:
        for iy in searchY:
            # Independent of iz: planar distance from (ix, iy), rounded down.
            ir = np.floor(
                ((ix - r_vector[:, 0]) ** 2 + (iy - r_vector[:, 1]) ** 2) ** 0.5
            )
            for iz in searchZ:
                dt = [
                    sim[int(iz)][int(ir[ip]), int(z_t[ip])]
                    for ip in range(p_true.size)
                ]
                # dt is a plain list; convert explicitly before subtracting.
                dt_sim = np.asarray(dt) - min(dt)
                calc1[int(ix), int(iy), int(iz)] = np.linalg.norm(
                    a_factor * (dt_sim - dt_true)
                )
    return calc1
假设您使用的是多核 CPU,可以先生成 `searchX` 和 `searchY` 所有可能组合的元组,然后使用 `multiprocessing.Pool` 或 `concurrent.futures.ProcessPoolExecutor`,把 `searchY` 循环内部的计算分配到所有内核上,因为这些计算彼此独立。如果您有 n 个内核,运行时间应该可以缩短到原来的约 1/n。