在cython中使用两个数组作为输入来矢量化类的函数

时间:2015-04-26 15:57:21

标签: python arrays numpy vectorization cython

我正在努力解决优化cython代码的问题,以尽可能提高速度。我无法弄清楚应该如何在cython中完成的挑战之一是将数组映射到函数上,就像numpy.vectorize函数中所做的那样。

我的问题的简化版本是

from __future__ import division
import numpy as np
cimport numpy as np
cimport cython
cdef class Test(object):
    cdef public double M, c, z
    cdef public double[::1] ks, zs, pos

    @cython.boundscheck(False)
    @cython.cdivision(True)
    @cython.wraparound(False)
    @cython.nonecheck(False)
    def __cinit__(self, M, c, z, pos, ks, zs=None):


        if path is None:
           raise ValueError("Could not find a path to the file which contains the table of angular diameter distances")

        self.M = M
        self.c = c
        self.z = z
        self.pos = pos

        if zs is None:
           raise ValueError("You must give an array which contains the steps where the redshift probability distribution are computed!")
        self.zs=zs
        self.ks=ks
    @cython.cdivision(True)    
    @cython.boundscheck(False)
    @cython.wraparound(False)
    cpdef np.ndarray[double, ndim=1, mode='c'] __kappa(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
        cdef Py_ssize_t N = x.shape[0]
        cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask


        cdef np.ndarray[double, ndim=1, mode='c'] out  = np.zeros(N, dtype=np.float64 , order='C')

        mask = np.where(x < 0.999)[0]
        out[mask] = 2*ks/(x[mask]**2 - 1) * \
                (1 - np.log((1 + ((1 - x[mask])/(x[mask] + 1))**0.5)/(1 - ((1 - x[mask])/(x[mask] + 1))**0.5))/(1 - x[mask]**2)**0.5)

        mask = np.where(x > 1.001)[0]
        out[mask] = 2*ks/(x[mask]**2 - 1) * \
                (1 - 2*np.arctan(((x[mask] - 1)/(x[mask] + 1))**0.5)/(x[mask]**2 - 1)**0.5)


        mask = np.where((x >= 0.999) & (x <= 1.001))[0]
        out[mask] = ks*(22./15. - 0.8*x[mask])           

        return out

    @cython.cdivision(True)    
    @cython.boundscheck(False)
    @cython.wraparound(False)
    cpdef np.ndarray[double, ndim=1, mode='c'] __gamma(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
        cdef Py_ssize_t N=len(x)
        cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask 
        cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros(N, dtype=np.float64 , order='C')


        mask = np.where(x > 0.01)[0]
        out[mask] = 4*ks*(np.log(x[mask]/2) + 2* \
                x[mask]**(-2) - self.__kappa(x[mask], ks)


        mask = np.where(x <= 0.01)[0]
        out[mask] = 4*ks*(0.25 + 0.125 * x[mask]**2 * (3.25 + 3.0*np.log(x[mask]/2)))

        return out

    cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
        # Convert to numpy arrays for internal usage:
        cdef np.ndarray[double, ndim=1, mode='c'] g, kappa, r, ks, wg
        cdef np.ndarray[double, ndim=1, mode='c'] pos_x, pos_y 
        if not gpos[:,0].flags.c_contiguous:
           pos_x = gpos[:,0].copy(order='C')
        else:
           pos_x = gpos[:,0]
        if not gpos[:,1].flags.c_contiguous:
           pos_y = gpos[:,1].copy(order='C')
        else:
           pos_y = gpos[:,1]
        cdef Py_ssize_t i, mask, N

        r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5

        ks  = np.ascontiguousarray(self.ks)
        N   = len(ks)
        mask= np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]

        wg  = np.zeros(len(r), dtype=np.float64 , order='C')

        for i from N > i >= 0:  
            g = self.__gamma(r, ks[i])

            kappa = self.__kappa(r, ks[i])
            g /= 1 - kappa
            wg+=g*pdf_z[:,mask+i]

        cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
        dx = pos_x - self.halo_pos[0]
        dy = pos_y - self.halo_pos[1]
        drsq = dx*dx+dy*dy
        drsq[drsq==0.] = 1. # Avoid division by 0
        cos2phi = (dx*dx-dy*dy)/drsq
        sin2phi = 2*dx*dy/drsq
        g1 = -wg*cos2phi
        g2 = -wg*sin2phi

        return g1, g2

我想知道是否有一种方法可以对getShTest类的ks方法进行矢量化,并避免使用能使代码更快的东西来使用循环?< / p>

2 个答案:

答案 0 :(得分:2)

如果您可以将整个数组ks传递给方法self.__gamma()self.__kappa(),则可以完成代码的矢量化,从而防止每次循环迭代时函数调用的开销将循环移动到最内部调用的方法。

还有一些其他技巧可以提供额外的性能:

  • 仅在循环外计算一次掩码,因为它们与数组r
  • 相关
  • mask = x > 0.01代替mask = np.where(x > 0.01)[0]和similars
  • 重复使用out数组,因为它始终具有length = N

编辑: 在将上述想法付诸实践后,我提出了以下解决方案,其中不再需要方法__kappa()__gamma()。它没有经过测试:

cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
    # Convert to numpy arrays for internal usage:
    cdef np.ndarray[double, ndim=1] r, ks, wg
    cdef np.ndarray[double, ndim=1] pos_x, pos_y
    cdef np.ndarray[double, ndim=2, mode='c'] gamma, kappa, wgtmp

    if not gpos[:,0].flags.c_contiguous:
       pos_x = gpos[:,0].copy(order='C')
    else:
       pos_x = gpos[:,0]
    if not gpos[:,1].flags.c_contiguous:
       pos_y = gpos[:,1].copy(order='C')
    else:
       pos_y = gpos[:,1]
    cdef Py_ssize_t i, mask, N

    r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5

    m1 = r > 0.01
    m2 = ~m1
    r1 = r[m1]
    r2 = r[m2]

    ma = r < 0.999
    mb = (r >= 0.999) & (r <= 1.001)
    mc = r > 1.001
    ra = r[ma]
    rb = r[mb]
    rc = r[mc]

    ks  = np.ascontiguousarray(self.ks)
    one = np.ones_like(ks)
    N = len(ks)
    P = len(r)

    kappa = np.zeros((P, N), dtype=np.float64 , order='C')
    gamma = np.zeros((P, N), dtype=np.float64 , order='C')
    wgtmp = np.zeros((P, N), dtype=np.float64 , order='C')
    wg = np.zeros((P,), dtype=np.float64)

    kappa[ma] = (2*ks/(ra**2 - 1)[:, None] *
                 one*(1 - np.log((1 + ((1 - ra)/(ra + 1))**0.5)/(1 - ((1 -
                      ra)/(ra + 1))**0.5))/(1 - ra**2)**0.5)[:, None])

    kappa[mb] = ks*(22./15. - 0.8*rb)[:, None]

    kappa[mc] = (2*ks/(rc**2 - 1)[:, None] *
                 one*(1 - 2*np.arctan(((rc - 1)/(rc + 1))**0.5)/(rc**2 -
                     1)**0.5)[:, None])


    gamma[m1 & ma] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[ma])[:, None]
    gamma[m1 & mb] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[mb])[:, None]
    gamma[m1 & mc] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[mc])[:, None]

    gamma[m2] = 4*ks*(0.25 + 0.125 * r2**2 * (3.25 + 3.0*np.log(r2/2)))[:, None]


    init = np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]

    wgtmp = gamma/(1-kappa) * pdf_z[:, init:init+N]
    wg = wgtmp.sum(axis=1)

    cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
    dx = pos_x - self.halo_pos[0]
    dy = pos_y - self.halo_pos[1]
    drsq = dx*dx+dy*dy
    drsq[drsq==0.] = 1. # Avoid division by 0
    cos2phi = (dx*dx-dy*dy)/drsq
    sin2phi = 2*dx*dy/drsq
    g1 = -wg*cos2phi
    g2 = -wg*sin2phi

    return g1, g2

答案 1 :(得分:0)

我不认为'vectorize'适用于cython。在numpy中,您可以使用快速编译的代码进行向量化,例如+*sum等操作。有一个np.vectorize函数,但它只是将代码包装在一个理解广播和多维数组的迭代器中。它不会重写你的功能,也不会加快它的速度。

Cython用于加速无法在现有编译向量操作中表达的numpy代码。它通过编译快速C迭代来获得速度。

表面上i中的getSh循环看起来很快(c样式),但它会调用self.__kappaself.__gamma。两者都加载np来电 - np.arraynp.wherenp.log等。通过这些来电,您无法获得纯粹的{ {1}}代码。

您需要关注这两种方法,将它们表示为对数字的简单操作,并根据需要显式迭代c样式。