我正在努力解决优化cython
代码的问题,以尽可能提高速度。我无法弄清楚应该如何在cython
中完成的挑战之一是将数组映射到函数上,就像numpy.vectorize
函数中所做的那样。
我的问题的简化版本是
from __future__ import division
import numpy as np
cimport numpy as np
cimport cython
cdef class Test(object):
cdef public double M, c, z
cdef public double[::1] ks, zs, pos
@cython.boundscheck(False)
@cython.cdivision(True)
@cython.wraparound(False)
@cython.nonecheck(False)
def __cinit__(self, M, c, z, pos, ks, zs=None):
if path is None:
raise ValueError("Could not find a path to the file which contains the table of angular diameter distances")
self.M = M
self.c = c
self.z = z
self.pos = pos
if zs is None:
raise ValueError("You must give an array which contains the steps where the redshift probability distribution are computed!")
self.zs=zs
self.ks=ks
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[double, ndim=1, mode='c'] __kappa(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
cdef Py_ssize_t N = x.shape[0]
cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask
cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros(N, dtype=np.float64 , order='C')
mask = np.where(x < 0.999)[0]
out[mask] = 2*ks/(x[mask]**2 - 1) * \
(1 - np.log((1 + ((1 - x[mask])/(x[mask] + 1))**0.5)/(1 - ((1 - x[mask])/(x[mask] + 1))**0.5))/(1 - x[mask]**2)**0.5)
mask = np.where(x > 1.001)[0]
out[mask] = 2*ks/(x[mask]**2 - 1) * \
(1 - 2*np.arctan(((x[mask] - 1)/(x[mask] + 1))**0.5)/(x[mask]**2 - 1)**0.5)
mask = np.where((x >= 0.999) & (x <= 1.001))[0]
out[mask] = ks*(22./15. - 0.8*x[mask])
return out
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[double, ndim=1, mode='c'] __gamma(self, np.ndarray[double, ndim=1, mode='c'] x, double ks):
cdef Py_ssize_t N=len(x)
cdef np.ndarray[np.int64_t, ndim=1, mode='c'] mask
cdef np.ndarray[double, ndim=1, mode='c'] out = np.zeros(N, dtype=np.float64 , order='C')
mask = np.where(x > 0.01)[0]
out[mask] = 4*ks*(np.log(x[mask]/2) + 2* \
x[mask]**(-2) - self.__kappa(x[mask], ks)
mask = np.where(x <= 0.01)[0]
out[mask] = 4*ks*(0.25 + 0.125 * x[mask]**2 * (3.25 + 3.0*np.log(x[mask]/2)))
return out
cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
# Convert to numpy arrays for internal usage:
cdef np.ndarray[double, ndim=1, mode='c'] g, kappa, r, ks, wg
cdef np.ndarray[double, ndim=1, mode='c'] pos_x, pos_y
if not gpos[:,0].flags.c_contiguous:
pos_x = gpos[:,0].copy(order='C')
else:
pos_x = gpos[:,0]
if not gpos[:,1].flags.c_contiguous:
pos_y = gpos[:,1].copy(order='C')
else:
pos_y = gpos[:,1]
cdef Py_ssize_t i, mask, N
r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5
ks = np.ascontiguousarray(self.ks)
N = len(ks)
mask= np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]
wg = np.zeros(len(r), dtype=np.float64 , order='C')
for i from N > i >= 0:
g = self.__gamma(r, ks[i])
kappa = self.__kappa(r, ks[i])
g /= 1 - kappa
wg+=g*pdf_z[:,mask+i]
cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
dx = pos_x - self.halo_pos[0]
dy = pos_y - self.halo_pos[1]
drsq = dx*dx+dy*dy
drsq[drsq==0.] = 1. # Avoid division by 0
cos2phi = (dx*dx-dy*dy)/drsq
sin2phi = 2*dx*dy/drsq
g1 = -wg*cos2phi
g2 = -wg*sin2phi
return g1, g2
我想知道是否有一种方法可以对getSh
类Test
类的ks
方法进行矢量化,并避免使用能使代码更快的东西来使用循环?< / p>
答案 0 :(得分:2)
如果您可以将整个数组ks
传递给方法self.__gamma()
和self.__kappa()
,则可以完成代码的矢量化,从而防止每次循环迭代时函数调用的开销将循环移动到最内部调用的方法。
还有一些其他技巧可以提供额外的性能:
r
mask = x > 0.01
代替mask = np.where(x > 0.01)[0]
和similars out
数组,因为它始终具有length = N
编辑:
在将上述想法付诸实践后,我提出了以下解决方案,其中不再需要方法__kappa()
和__gamma()
。它没有经过测试:
cpdef tuple getSh(self, np.ndarray[double, ndim=2, mode='c'] gpos, np.ndarray[double, ndim=2, mode='c'] pdf_z):
# Convert to numpy arrays for internal usage:
cdef np.ndarray[double, ndim=1] r, ks, wg
cdef np.ndarray[double, ndim=1] pos_x, pos_y
cdef np.ndarray[double, ndim=2, mode='c'] gamma, kappa, wgtmp
if not gpos[:,0].flags.c_contiguous:
pos_x = gpos[:,0].copy(order='C')
else:
pos_x = gpos[:,0]
if not gpos[:,1].flags.c_contiguous:
pos_y = gpos[:,1].copy(order='C')
else:
pos_y = gpos[:,1]
cdef Py_ssize_t i, mask, N
r = ((pos_x - self.pos[0])**2 + (pos_y - self.pos[1])**2)**0.5
m1 = r > 0.01
m2 = ~m1
r1 = r[m1]
r2 = r[m2]
ma = r < 0.999
mb = (r >= 0.999) & (r <= 1.001)
mc = r > 1.001
ra = r[ma]
rb = r[mb]
rc = r[mc]
ks = np.ascontiguousarray(self.ks)
one = np.ones_like(ks)
N = len(ks)
P = len(r)
kappa = np.zeros((P, N), dtype=np.float64 , order='C')
gamma = np.zeros((P, N), dtype=np.float64 , order='C')
wgtmp = np.zeros((P, N), dtype=np.float64 , order='C')
wg = np.zeros((P,), dtype=np.float64)
kappa[ma] = (2*ks/(ra**2 - 1)[:, None] *
one*(1 - np.log((1 + ((1 - ra)/(ra + 1))**0.5)/(1 - ((1 -
ra)/(ra + 1))**0.5))/(1 - ra**2)**0.5)[:, None])
kappa[mb] = ks*(22./15. - 0.8*rb)[:, None]
kappa[mc] = (2*ks/(rc**2 - 1)[:, None] *
one*(1 - 2*np.arctan(((rc - 1)/(rc + 1))**0.5)/(rc**2 -
1)**0.5)[:, None])
gamma[m1 & ma] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[ma])[:, None]
gamma[m1 & mb] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[mb])[:, None]
gamma[m1 & mc] = 4*ks*(np.log(r1/2) + 2*r1**(-2) - kappa[mc])[:, None]
gamma[m2] = 4*ks*(0.25 + 0.125 * r2**2 * (3.25 + 3.0*np.log(r2/2)))[:, None]
init = np.where(np.ascontiguousarray(self.zs)>(self.z+0.1))[0][0]
wgtmp = gamma/(1-kappa) * pdf_z[:, init:init+N]
wg = wgtmp.sum(axis=1)
cdef np.ndarray[double, ndim=1, mode='c'] dx, dy, drsq, cos2phi, sin2phi, g1, g2
dx = pos_x - self.halo_pos[0]
dy = pos_y - self.halo_pos[1]
drsq = dx*dx+dy*dy
drsq[drsq==0.] = 1. # Avoid division by 0
cos2phi = (dx*dx-dy*dy)/drsq
sin2phi = 2*dx*dy/drsq
g1 = -wg*cos2phi
g2 = -wg*sin2phi
return g1, g2
答案 1 :(得分:0)
我不认为'vectorize'适用于cython。在numpy
中,您可以使用快速编译的代码进行向量化,例如+
,*
,sum
等操作。有一个np.vectorize
函数,但它只是将代码包装在一个理解广播和多维数组的迭代器中。它不会重写你的功能,也不会加快它的速度。
Cython用于加速无法在现有编译向量操作中表达的numpy代码。它通过编译快速C迭代来获得速度。
表面上i
中的getSh
循环看起来很快(c样式),但它会调用self.__kappa
和self.__gamma
。两者都加载np
来电 - np.array
,np.where
,np.log
等。通过这些来电,您无法获得纯粹的{ {1}}代码。
您需要关注这两种方法,将它们表示为对数字的简单操作,并根据需要显式迭代c
样式。