作为大量代码的一部分,我需要使用不同的参数调用(简化的)函数example
(粘贴在下面)多次(数十万次)。因此,我需要这个模块快速运行。
模块的主要问题似乎是多个嵌套循环。但是,我不确定是否存在与这些循环相关的实际不必要的开销(如编写的那样),或者如果代码真的那么快就可以获得。
通常,在cython中处理多个嵌套for循环时,是否存在可用于减少开销和加速代码的循环优化技术?这些技术中的任何一种都适用于下面粘贴的示例代码吗?
我也正在用extra_compile_args=["-ffast-math",'-O3']
编译cython,尽管这似乎并没有产生很大的影响。
如果这段代码在cython中真的不能更快(我希望不是这样),那么用C或Fortran写这个模块的全部或部分会有什么好处吗?
import numpy as np
import math
cimport numpy as np
cimport cython
DTYPE = np.float
ctypedef np.float_t DTYPE_t
cdef extern from "math.h":
double log(double x) nogil
double exp(double x) nogil
double pow(double x, double y) nogil
def example(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc = 1000.0):
return example_int(xbg_PSF_compressed,theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data, Sc)
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):
cdef int k_max = np.max(data) + 1
cdef double A = np.float(theta[0])
cdef double n1 = np.float(theta[1])
cdef double n2 = np.float(theta[2])
cdef double Sb = np.float(theta[3])
cdef int npixROI = len(xbg_PSF_compressed)
cdef double f2 = 0.0
cdef double df_rho_div_f2 = 0.0
cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
cdef double[:,::1] x_m_ary_f = np.zeros((k_max + 1, npixROI), dtype=DTYPE)
cdef double[::1] x_m_sum_f = np.zeros(npixROI, dtype=DTYPE)
cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
cdef Py_ssize_t f_index, p, k, n
#calculations for PS
cdef int do_half = 0
cdef double term1 = 0.0
cdef double term2 = 0.0
cdef double second_2_a = 0.0
cdef double second_2_b = 0.0
cdef double second_2_c = 0.0
cdef double second_2_d = 0.0
cdef double second_1_a = 0.0
cdef double second_1_b = 0.0
cdef double second_1_c = 0.0
cdef double second_1_d = 0.0
for f_index in range(len(f_ary)):
f2 = f_ary[f_index]
df_rho_div_f2 = df_rho_div_f_ary[f_index]
g1_ary_f = np.random.random(k_max+1)
g2_ary_f = np.random.random(k_max+1)
term1 = (A * Sb * f2) \
* (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
- (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
second_1_a = A * pow(Sb * f2, n1)
second_1_b = A * pow(Sb * f2, n2)
for p in range(npixROI):
x_m_sum_f[p] = term1 * PS_dist_compressed[p]
x_m_sum[p] += df_rho_div_f2*x_m_sum_f[p]
second_1_c = second_1_a * PS_dist_compressed[p]
second_1_d = second_1_b * PS_dist_compressed[p]
for k in range(data[p]+1):
x_m_ary_f[k,p] = second_1_c * g1_ary_f[k] + second_1_d * g2_ary_f[k]
x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f[k,p]
cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)
cdef double[::1] f0_ary = np.zeros(npixROI, dtype=DTYPE)
cdef double[::1] f1_ary = np.zeros(npixROI, dtype=DTYPE)
cdef double[:,::1] nu_mat = np.zeros((k_max+1, npixROI), dtype=DTYPE)
cdef double ll = 0.
for p in range(npixROI):
f0_ary[p] = -(xbg_PSF_compressed[p] + x_m_sum[p])
f1_ary[p] = (xbg_PSF_compressed[p] + x_m_ary[1,p])
nu_mat[0,p] = exp(f0_ary[p])
nu_mat[1,p] = nu_mat[0,p] * f1_ary[p]
for k in range(2,data[p]+1):
for n in range(0, k - 1):
nu_mat[k,p] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n,p]
nu_mat[k,p] += f1_ary[p] * nu_mat[k-1,p] / float(k)
ll+=log( nu_mat[data[p],p])
if math.isnan(ll) ==True or math.isinf(ll) ==True:
ll = -10.1**10.
return ll
作为参考,在尝试运行此代码时,示例参数为
f_ary=np.array([ 0.05, 0.15, 0.25 , 0.35 , 0.45 ,0.55 , 0.65 , 0.75, 0.85 , 0.95])
df_rho_div_f_ary = np.array([ 24.27277928, 2.83852471 , 1.14224844 , 0.61687863 , 0.39948536,
0.30138642 , 0.24715899 , 0.22077999 , 0.21594814 , 0.19035121])
theta=[.002, 3.01,0.01, 10.013]
n_p=1000
data= np.random.randint(1,400,n_p).astype(dtype='int32')
k_max=int(np.max(data))+1
xbg_PSF_compressed= np.ones(n_p)*20
PS_dist_compressed= np.ones(n_p)
然后可以将该示例称为example(k_max,xbg_PSF_compressed,theta,f_ary,df_rho_div_f_ary, PS_dist_compressed)
。对于计时,我发现此示例的计算结果为〜10 loops, best of 3: 147 ms per loop
。由于完整代码需要数小时才能运行,因此运行时间的任何减少都会对性能产生很大的影响。