Question

作为大量代码的一部分，我需要使用不同的参数调用（简化的）函数example（粘贴在下面）多次（数十万次）。因此，我需要这个模块快速运行。

模块的主要问题似乎是多个嵌套循环。但是，我不确定是否存在与这些循环相关的实际不必要的开销（如编写的那样），或者如果代码真的那么快就可以获得。

通常，在cython中处理多个嵌套for循环时，是否存在可用于减少开销和加速代码的循环优化技术？这些技术中的任何一种都适用于下面粘贴的示例代码吗？

我也正在用extra_compile_args=["-ffast-math",'-O3']编译cython，尽管这似乎并没有产生很大的影响。

如果这段代码在cython中真的不能更快（我希望不是这样），那么用C或Fortran写这个模块的全部或部分会有什么好处吗？

import numpy as np
import math
cimport numpy as np
cimport cython

DTYPE = np.float
ctypedef np.float_t DTYPE_t

cdef extern from "math.h":
    double log(double x) nogil
    double exp(double x) nogil
    double pow(double x, double y) nogil


def example(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc = 1000.0):
    return example_int(xbg_PSF_compressed,theta, f_ary, df_rho_div_f_ary, PS_dist_compressed, data, Sc)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
cdef double example_int(double[::1] xbg_PSF_compressed, double[::1] theta, double[::1] f_ary, double[::1] df_rho_div_f_ary, double[::1] PS_dist_compressed, int[::1] data, double Sc ):

    cdef int k_max = np.max(data) + 1

    cdef double A = np.float(theta[0])
    cdef double n1 = np.float(theta[1])
    cdef double n2 = np.float(theta[2])
    cdef double Sb = np.float(theta[3])

    cdef int npixROI = len(xbg_PSF_compressed)

    cdef double f2 = 0.0
    cdef double df_rho_div_f2 = 0.0


    cdef double[:,::1] x_m_ary = np.zeros((k_max + 1,npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum = np.zeros(npixROI, dtype=DTYPE)
    cdef double[:,::1] x_m_ary_f = np.zeros((k_max + 1, npixROI), dtype=DTYPE)
    cdef double[::1] x_m_sum_f = np.zeros(npixROI, dtype=DTYPE)

    cdef double[::1] g1_ary_f = np.zeros(k_max + 1, dtype=DTYPE)
    cdef double[::1] g2_ary_f = np.zeros(k_max + 1, dtype=DTYPE)

    cdef Py_ssize_t f_index, p, k, n


    #calculations for PS

    cdef int do_half = 0

    cdef double term1 = 0.0
    cdef double term2 = 0.0
    cdef double second_2_a = 0.0
    cdef double second_2_b = 0.0
    cdef double second_2_c = 0.0
    cdef double second_2_d = 0.0
    cdef double second_1_a = 0.0
    cdef double second_1_b = 0.0
    cdef double second_1_c = 0.0
    cdef double second_1_d = 0.0



    for f_index in range(len(f_ary)):
        f2 = f_ary[f_index]
        df_rho_div_f2 = df_rho_div_f_ary[f_index]
        g1_ary_f = np.random.random(k_max+1)
        g2_ary_f = np.random.random(k_max+1)
        term1 = (A * Sb * f2) \
                             * (1./(n1-1.) + 1./(1.-n2) - pow(Sb / Sc, n1-1.)/(n1-1.) \
                                - (pow(Sb * f2, n1-1.) * g1_ary_f[0] + pow(Sb * f2, n2-1.) * g2_ary_f[0]))
        second_1_a =  A  * pow(Sb * f2, n1)
        second_1_b = A * pow(Sb * f2, n2)

        for p in range(npixROI):
            x_m_sum_f[p] = term1 * PS_dist_compressed[p]
            x_m_sum[p] += df_rho_div_f2*x_m_sum_f[p]

            second_1_c = second_1_a * PS_dist_compressed[p]
            second_1_d = second_1_b * PS_dist_compressed[p]
            for k in range(data[p]+1):            
                x_m_ary_f[k,p] = second_1_c  * g1_ary_f[k] + second_1_d * g2_ary_f[k] 
                x_m_ary[k,p] += df_rho_div_f2*x_m_ary_f[k,p]

    cdef double[::1] nu_ary = np.zeros(k_max + 1, dtype=DTYPE)

    cdef double[::1] f0_ary = np.zeros(npixROI, dtype=DTYPE) 
    cdef double[::1] f1_ary = np.zeros(npixROI, dtype=DTYPE) 

    cdef double[:,::1] nu_mat = np.zeros((k_max+1, npixROI), dtype=DTYPE)


    cdef double ll = 0.

    for p in range(npixROI):
        f0_ary[p] = -(xbg_PSF_compressed[p] + x_m_sum[p])
        f1_ary[p] = (xbg_PSF_compressed[p] + x_m_ary[1,p])
        nu_mat[0,p] = exp(f0_ary[p])
        nu_mat[1,p] = nu_mat[0,p] * f1_ary[p]

        for k in range(2,data[p]+1):
            for n in range(0, k - 1):
                nu_mat[k,p] += (k-n)/ float(k) * x_m_ary[k-n,p] * nu_mat[n,p]
            nu_mat[k,p] += f1_ary[p] * nu_mat[k-1,p] / float(k)
        ll+=log( nu_mat[data[p],p])

    if math.isnan(ll) ==True or math.isinf(ll) ==True:
        ll = -10.1**10.

    return ll

作为参考，在尝试运行此代码时，示例参数为

f_ary=np.array([ 0.05,  0.15,  0.25 , 0.35 , 0.45  ,0.55 , 0.65 , 0.75,  0.85 , 0.95])
df_rho_div_f_ary = np.array([ 24.27277928,   2.83852471 ,  1.14224844 ,  0.61687863  , 0.39948536,
   0.30138642 ,  0.24715899 ,  0.22077999 ,  0.21594814 ,  0.19035121])
theta=[.002, 3.01,0.01, 10.013]
n_p=1000
data= np.random.randint(1,400,n_p).astype(dtype='int32')
k_max=int(np.max(data))+1
xbg_PSF_compressed= np.ones(n_p)*20 
PS_dist_compressed= np.ones(n_p)

然后可以将该示例称为example(k_max,xbg_PSF_compressed,theta,f_ary,df_rho_div_f_ary, PS_dist_compressed)。对于计时，我发现此示例的计算结果为〜10 loops, best of 3: 147 ms per loop。由于完整代码需要数小时才能运行，因此运行时间的任何减少都会对性能产生很大的影响。

cython中的嵌套循环优化：有没有更快的方法来设置这个代码示例？

0 个答案: