我正在尝试加速我的代码,这部分内容给我带来了问题,
我尝试使用Cython,并遵循了here(原文链接)给出的建议,但我的纯python函数反而比 compute_cython 和 compute_cythonOptimized 这两个函数表现更好。
cython代码如下:
import numpy as np
cimport numpy as np
# np.float was only an alias of the builtin float and has been removed from
# recent NumPy releases; name the 64-bit type explicitly so the Python-level
# dtype and the C-level element type below visibly agree.
DTYPE = np.float64
ctypedef np.float64_t DTYPE_t
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
def compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    """Return the density-weighted mix of the dust and ice terms for ``u``.

    The dust term is a cubic polynomial in ``u - 250.0``; the ice term is a
    rational function of ``u / 273.16``.  Every operation is element-wise, so
    scalars and NumPy arrays (of broadcast-compatible shapes) both work.

    Args:
        u: Input value(s) the two terms are evaluated at.
        PorosityProfile: Accepted for interface compatibility; not used.
        DensityIceProfile: Weight applied to the ice term.
        DensityDustProfile: Weight applied to the dust term.
        DensityProfile: Divisor applied to the weighted sum.

    Returns:
        ``(DensityIceProfile*ice + DensityDustProfile*dust) / DensityProfile``.
    """
    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6

    delta = u - DustJ
    result_dust = DustF + DustG*delta + DustH*delta**2 + DustI*(delta**3)

    x = u / IceI
    result_ice = (x**3)*(IceC + IceD*(x**2) + IceE*(x**6)) / (1 + IceF*(x**2) + IceG*(x**4) + IceH*(x**8))

    return (DensityIceProfile*result_ice + DensityDustProfile*result_dust) / DensityProfile
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def compute_cythonOptimized(np.ndarray[DTYPE_t, ndim=1] u, np.ndarray[DTYPE_t, ndim=1] PorosityProfile, np.ndarray[DTYPE_t, ndim=1] DensityIceProfile, np.ndarray[DTYPE_t, ndim=1] DensityDustProfile, np.ndarray[DTYPE_t, ndim=1] DensityProfile):
    """Element-wise density-weighted mix of the dust and ice terms, as a typed loop.

    Typing the arguments as buffers only pays off when the elements are
    indexed from C code; whole-array expressions still go through NumPy and
    allocate a temporary per operation.  This version therefore walks the
    arrays once and writes straight into the output.

    Args:
        u: 1-D array the two terms are evaluated at.
        PorosityProfile: Accepted for interface compatibility; not used.
        DensityIceProfile: Per-element weight of the ice term.
        DensityDustProfile: Per-element weight of the dust term.
        DensityProfile: Per-element divisor of the weighted sum.

    Returns:
        A new 1-D array with the same length and dtype as ``u``.

    Raises:
        ValueError: If a buffer has the wrong dtype/ndim (raised by the buffer
            typing itself) or the used profiles differ in length from ``u``.
    """
    # double, not float: the arrays hold 64-bit values, and single-precision
    # constants would be rounded before they ever meet the data.
    cdef double DustJ = 250.0
    cdef double DustF = 633.0
    cdef double DustG = 2.513
    cdef double DustH = -2.2e-3
    cdef double DustI = -2.8e-6
    cdef double IceI = 273.16
    cdef double IceC = 1.843e5
    cdef double IceD = 1.6357e8
    cdef double IceE = 3.5519e9
    cdef double IceF = 1.6670e2
    cdef double IceG = 6.4650e4
    cdef double IceH = 1.6935e6
    cdef Py_ssize_t n = u.shape[0]
    cdef Py_ssize_t i
    cdef double delta, result_dust, x, x2, x4, result_ice
    cdef np.ndarray[DTYPE_t, ndim=1] out = np.empty_like(u)

    # Bounds checking is disabled above, so the lengths must be validated once
    # up front instead of on every access.
    if (DensityIceProfile.shape[0] != n or DensityDustProfile.shape[0] != n
            or DensityProfile.shape[0] != n):
        raise ValueError("DensityIceProfile, DensityDustProfile and DensityProfile "
                         "must have the same length as u")

    for i in range(n):
        delta = u[i] - DustJ
        result_dust = DustF + DustG*delta + DustH*delta*delta + DustI*delta*delta*delta
        x = u[i] / IceI
        # Build the even powers by multiplication rather than calling pow().
        x2 = x*x
        x4 = x2*x2
        result_ice = (x*x2)*(IceC + IceD*x2 + IceE*x4*x2) / (1.0 + IceF*x2 + IceG*x4 + IceH*x4*x4)
        # cdivision(True) keeps NumPy-like inf/nan on a zero divisor instead
        # of raising ZeroDivisionError.
        out[i] = (DensityIceProfile[i]*result_ice + DensityDustProfile[i]*result_dust) / DensityProfile[i]
    return out
然后我运行以下命令:
def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    """Pure NumPy reference: density-weighted mix of the dust and ice terms.

    Args:
        u: Scalar or array the two terms are evaluated at.
        PorosityProfile: Accepted for interface compatibility; not used.
        DensityIceProfile: Weight applied to the ice term.
        DensityDustProfile: Weight applied to the dust term.
        DensityProfile: Divisor applied to the weighted sum.

    Returns:
        ``(DensityIceProfile*ice + DensityDustProfile*dust) / DensityProfile``,
        computed element-wise.
    """
    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6

    # Dust term: cubic polynomial in the offset from DustJ.
    delta = u - DustJ
    result_dust = DustF + DustG*delta + DustH*delta**2 + DustI*(delta**3)

    # Ice term: rational function of u scaled by IceI.
    x = u / IceI
    result_ice = (x**3)*(IceC + IceD*(x**2) + IceE*(x**6)) / (1 + IceF*(x**2) + IceG*(x**4) + IceH*(x**8))

    return (DensityIceProfile*result_ice + DensityDustProfile*result_dust) / DensityProfile
import sublimation
import numpy as np
%timeit compute_python(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))
%timeit compute_cython(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))
%timeit compute_cythonOptimized(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))
结果如下:
对于纯python:68.9 µs ± 851 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
对于非优化的cython:68.2 µs ± 685 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
对于优化的一个:72.7 µs ± 416 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
我做错了什么?
感谢您的帮助,
答案 0 :(得分:3)
我普遍同意@chepner和@ juanpa.arrivillaga在评论中提出的建议。 Numpy是一个高性能的库,它执行的底层计算确实是用C语言编写的。此外,语法很简洁,在numpy数组的所有元素上应用标量运算是微不足道的。
然而,如果我们使用以下假设(并且可以容忍丑陋的代码),实际上有一种方法可以显着提高代码的性能,这要归功于您的特定算法的结构方式:
- 您的数组都是一维的,因此逐项迭代非常简单。我们不需要替换像 `numpy.dot` 这样更难的numpy函数,因为代码中的所有操作只是将标量与数组结合起来。
- 在python中使用 `for` 循环是不可想象的,但迭代遍历每个索引在cython中是非常可行的。此外,最终输出中的每个项目仅取决于与该项目索引对应的输入(即第0项使用 `u[0]`、`PorosityProfile[0]` 等)。
- 您只对 `compute_python` 函数中返回的最终结果感兴趣,而不关心任何中间数组。因此,为什么要浪费时间为所有这些中间numpy数组分配内存?
- `x**y` 语法的速度非常慢。我使用 `gcc` 编译器选项 `-ffast-math` 来显着改善这一点。我还使用了几个cython编译器指令来避免python检查和开销。

考虑到所有这些因素,这里是修改后的代码。它比我的笔记本电脑上的朴素python版本快了近一个数量级。
**sublimation.pyx**
from cython cimport view
from libc.stdlib cimport malloc, free
def compute_cython(float[:] u, float[:] porosity_profile,
                   float[:] density_ice_profile, float[:] density_dust_profile,
                   float[:] density_profile):
    """Single-pass, element-wise density-weighted mix of the dust and ice terms.

    Args:
        u: 1-D float32 buffer the two terms are evaluated at.
        porosity_profile: Accepted for interface compatibility; not used.
        density_ice_profile: Per-element weight of the ice term.
        density_dust_profile: Per-element weight of the dust term.
        density_profile: Per-element divisor of the weighted sum.

    Returns:
        A Cython array of ``len(u)`` floats; wrap it with ``np.asarray`` to get
        an ndarray.  The array owns its buffer and frees it when collected.

    Raises:
        ValueError: If ``u`` is empty or the used profiles differ in length.
        MemoryError: If the output buffer cannot be allocated.
    """
    cdef:
        float dust_j, dust_f, dust_g, dust_h, dust_i
        float ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h
        Py_ssize_t size, i
        float dt, result_dust, x, dust
        float result_ice_numer, result_ice_denom, result_ice, ice
        float* out
        view.array result
    dust_j, dust_f, dust_g, dust_h, dust_i = \
        250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h = \
        273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    size = u.shape[0]
    # malloc(0) is implementation-defined and a zero-length buffer cannot be
    # wrapped as a Cython array, so reject empty input explicitly.
    if size == 0:
        raise ValueError("input arrays must not be empty")
    # The build disables bounds checking, so a shorter profile would be read
    # out of bounds; validate the lengths once up front.
    if (density_ice_profile.shape[0] != size
            or density_dust_profile.shape[0] != size
            or density_profile.shape[0] != size):
        raise ValueError("density_ice_profile, density_dust_profile and "
                         "density_profile must have the same length as u")
    out = <float *>malloc(size * sizeof(float))
    if out == NULL:
        raise MemoryError("could not allocate the output buffer")
    # Hand ownership to the array right away: without a free callback the
    # malloc'd block would never be released and every call would leak it.
    result = <float[:size]>out
    result.callback_free_data = free
    for i in range(size):
        dt = u[i] - dust_j
        result_dust = dust_f + (dust_g*dt) + (dust_h*dt**2) + (dust_i*dt**3)
        x = u[i] / ice_i
        result_ice_numer = x**3*(ice_c + ice_d*x**2 + ice_e*x**6)
        result_ice_denom = 1 + ice_f*x**2 + ice_g*x**4 + ice_h*x**8
        result_ice = result_ice_numer / result_ice_denom
        ice = density_ice_profile[i]*result_ice
        dust = density_dust_profile[i]*result_dust
        out[i] = (dust + ice)/density_profile[i]
    return result
**setup.py**
from distutils.core import setup
from Cython.Build import cythonize
from distutils.core import Extension
def create_extension(ext_name, libraries=None, language=None,
                     extra_compile_args=None, extra_link_args=None):
    """Build the ``Extension`` for the ``.pyx`` file matching a dotted module name.

    ``"pkg.mod"`` maps to the source path ``"./pkg/mod.pyx"``.

    Args:
        ext_name: Dotted name of the extension module.
        libraries: Libraries to link against.  Defaults to the module-level
            ``libs`` if it is defined, otherwise an empty list.
        language: Extension language.  Defaults to the module-level
            ``language`` if it is defined, otherwise ``None``.
        extra_compile_args: Extra compiler flags.  Defaults to the
            module-level ``args`` if it is defined, otherwise an empty list.
        extra_link_args: Extra linker flags.  Defaults to the module-level
            ``link_args`` if it is defined, otherwise an empty list.

    Returns:
        The configured ``Extension`` instance.
    """
    # Fall back to the script-level settings so existing one-argument calls
    # keep working, without failing when those globals were never assigned
    # (for example when this module is imported instead of run).
    settings = globals()
    if libraries is None:
        libraries = settings.get("libs", [])
    if language is None:
        language = settings.get("language")
    if extra_compile_args is None:
        extra_compile_args = settings.get("args", [])
    if extra_link_args is None:
        extra_link_args = settings.get("link_args", [])

    path_parts = ext_name.split(".")
    path = "./{0}.pyx".format("/".join(path_parts))
    return Extension(ext_name, sources=[path], libraries=libraries,
                     language=language, extra_compile_args=extra_compile_args,
                     extra_link_args=extra_link_args)
if __name__ == "__main__":
    # Build settings.  create_extension() picks up libs, language, args and
    # link_args as module globals, so these names must stay as they are.
    libs = []  # no external C libraries in this case
    language = "c"  # C rather than C++, since no C++ features were used
    args = ["-w", "-O3", "-ffast-math"]  # assumes gcc is the compiler
    link_args = []  # none here; -fopenmp could be added for parallel code
    annotate = True  # auto-generates an .html report per .pyx
    # Applied to every module, which saves typing per-function @cython
    # decorators.
    directives = {
        "boundscheck": False,
        "wraparound": False,
        "initializedcheck": False,
        "cdivision": True,
        "nonecheck": False,
    }
    ext_names = [
        "sublimation",
    ]
    extensions = [create_extension(ext_name) for ext_name in ext_names]
    setup(
        ext_modules=cythonize(
            extensions,
            annotate=annotate,
            compiler_directives=directives,
        )
    )
**main.py**
import numpy as np
import sublimation as sub
def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    """NumPy baseline that the compiled version is timed against.

    Evaluates a cubic dust polynomial in ``u - 250.0`` and a rational ice
    function of ``u / 273.16``, then combines them weighted by the two density
    profiles and divided by ``DensityProfile``.  All operations are
    element-wise.

    Args:
        u: Scalar or array the two terms are evaluated at.
        PorosityProfile: Accepted for interface compatibility; not used.
        DensityIceProfile: Weight applied to the ice term.
        DensityDustProfile: Weight applied to the dust term.
        DensityProfile: Divisor applied to the weighted sum.

    Returns:
        The combined result, with the broadcast shape of the inputs.
    """
    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    delta = u - DustJ
    result_dust = DustF + DustG*delta + DustH*delta**2 + DustI*(delta**3)
    x = u / IceI
    result_ice = (x**3)*(IceC + IceD*(x**2) + IceE*(x**6)) / (1 + IceF*(x**2) + IceG*(x**4) + IceH*(x**8))
    return (DensityIceProfile*result_ice + DensityDustProfile*result_dust) / DensityProfile
size = 100
# Five independent float32 sample vectors.  The generator is consumed left to
# right, so the random stream is drawn in the order u, porosity, ice, dust,
# density.
u, porosity, ice, dust, density = (
    np.random.rand(size).astype(np.float32) for _ in range(5)
)
"""
Run these from the terminal to check out the performance!
python3 -m timeit -s "from main import compute_python, u, porosity, ice, dust, density" "compute_python(u, porosity, ice, dust, density)"
python3 -m timeit -s "from main import sub, u, porosity, ice, dust, density" "sub.compute_cython(u, porosity, ice, dust, density)"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython(u, porosity, ice, dust, density))"
The first command tests the python version. (10000 loops, best of 3: 45.5 usec per loop)
The second command tests the cython version, but returns just a memoryview object. (100000 loops, best of 3: 4.63 usec per loop)
The third command tests the cython version, but converts the result to a ndarray (slower). (100000 loops, best of 3: 6.3 usec per loop)
"""
如果我对这个答案工作原理的解释有任何不清楚的地方,请告诉我。希望它有所帮助!
更新1:
不幸的是,我无法让MSYS2和numba(依赖于LLVM)很好地协同工作,所以我无法进行任何直接的比较。但是,根据@max9111的建议,我将 `-march=native` 添加到了 `setup.py` 文件中的 `args` 列表中;然而,时间与以前没有显着差异。
根据this great answer(原文链接),numpy数组和类型化内存视图之间的自动转换似乎有一些开销,这种开销在初始函数调用时就会发生(如果你将结果转换回来,在return语句中也会发生)。恢复使用这样的函数签名:
ctypedef np.float32_t DTYPE_t
def compute_cython_np(
np.ndarray[DTYPE_t, ndim=1] u,
np.ndarray[DTYPE_t, ndim=1] porosity_profile,
np.ndarray[DTYPE_t, ndim=1] density_ice_profile,
np.ndarray[DTYPE_t, ndim=1] density_dust_profile,
np.ndarray[DTYPE_t, ndim=1] density_profile):
每次调用节省大约1us,将其降低到大约3.6us而不是4.6us,这有点重要,特别是如果要多次调用该函数。当然,如果你打算多次调用这个函数,那么改为传入二维numpy数组可能会更有效率,这样可以节省大量的python函数调用开销,并分摊 `numpy array -> typed memoryview` 转换的成本。此外,使用numpy结构化数组可能会很有趣,这些数组可以在cython中转换为结构体的类型化内存视图,因为这可能会使所有数据在缓存中更加紧密,并加快内存访问时间。
作为前面评论中承诺的最后一点补充,这是一个使用 `prange` 的版本,它利用了并行处理。请注意,这只能用于类型化的内存视图,因为python的GIL必须在prange循环中释放(并且需要在 `args` 和 `link_args` 中都加上 `-fopenmp` 标志进行编译)。该版本的代码位于下面“更新2”标题之后:
更新2:
根据评论中@max9111提供的非常有用的额外建议,我将代码中的所有 `float[:]` 声明切换为 `float[::1]`(说明见代码之后)。下面的代码是“更新1”中提到的 `prange` 版本:
from cython.parallel import prange
from cython cimport view
from libc.stdlib cimport malloc, free
def compute_cython_p(float[:] u, float[:] porosity_profile,
                     float[:] density_ice_profile, float[:] density_dust_profile,
                     float[:] density_profile):
    """Parallel (``prange``) element-wise mix of the dust and ice terms.

    Args:
        u: 1-D float32 buffer the two terms are evaluated at.
        porosity_profile: Accepted for interface compatibility; not used.
        density_ice_profile: Per-element weight of the ice term.
        density_dust_profile: Per-element weight of the dust term.
        density_profile: Per-element divisor of the weighted sum.

    Returns:
        A Cython array of ``len(u)`` floats; wrap it with ``np.asarray`` to get
        an ndarray.  The array owns its buffer and frees it when collected.

    Raises:
        ValueError: If ``u`` is empty or the used profiles differ in length.
        MemoryError: If the output buffer cannot be allocated.
    """
    cdef:
        float dust_j, dust_f, dust_g, dust_h, dust_i
        float ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h
        Py_ssize_t size, i
        float dt, result_dust, x, dust
        float result_ice_numer, result_ice_denom, result_ice, ice
        float* out
        view.array result
    dust_j, dust_f, dust_g, dust_h, dust_i = \
        250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h = \
        273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    size = u.shape[0]
    # malloc(0) is implementation-defined and a zero-length buffer cannot be
    # wrapped as a Cython array, so reject empty input explicitly.
    if size == 0:
        raise ValueError("input arrays must not be empty")
    # Exceptions cannot be raised once the GIL is released, so all validation
    # happens here, before the parallel loop.
    if (density_ice_profile.shape[0] != size
            or density_dust_profile.shape[0] != size
            or density_profile.shape[0] != size):
        raise ValueError("density_ice_profile, density_dust_profile and "
                         "density_profile must have the same length as u")
    out = <float *>malloc(size * sizeof(float))
    if out == NULL:
        raise MemoryError("could not allocate the output buffer")
    # Hand ownership to the array right away: without a free callback the
    # malloc'd block would never be released and every call would leak it.
    result = <float[:size]>out
    result.callback_free_data = free
    # Each iteration only touches index i, so iterations are independent; the
    # scalars assigned inside the loop are private to each thread.
    for i in prange(size, nogil=True):
        dt = u[i] - dust_j
        result_dust = dust_f + (dust_g*dt) + (dust_h*dt**2) + (dust_i*dt**3)
        x = u[i] / ice_i
        result_ice_numer = x**3*(ice_c + ice_d*x**2 + ice_e*x**6)
        result_ice_denom = 1 + ice_f*x**2 + ice_g*x**4 + ice_h*x**8
        result_ice = result_ice_numer / result_ice_denom
        ice = density_ice_profile[i]*result_ice
        dust = density_dust_profile[i]*result_dust
        out[i] = (dust + ice)/density_profile[i]
    return result
(接“更新2”)把声明从 `float[:]` 切换为 `float[::1]` 的意义在于,它允许数据连续存储,而cython不需要担心元素之间存在跨距。这允许SIMD矢量化,从而极大地进一步优化了代码。以下是更新后的时间(原答案在此处给出的命令和计时结果未包含在本文中):
答案 1 :(得分:3)
我创建了三个版本。在 `naive_Numba` 中我只添加了一个函数装饰器。在 `improved_Numba` 中我手动合并了循环(每个矢量化命令实际上都是一个循环)。在 `improved_Numba_p` 中我将函数并行化了。请注意,使用并行加速器(parallel=True)时,显然存在一个不允许在函数内定义常量值的Bug。还应注意,并行化版本仅对较大的输入数组有益。不过您也可以添加一个小的包装函数,根据输入数组的大小调用单线程或并行化版本。
代码dtype = float64
import numba as nb
import numpy as np
import time
@nb.njit(fastmath=True)
def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    """Whole-array version: the NumPy expressions unchanged, only decorated.

    Args:
        u: Array the two terms are evaluated at.
        PorosityProfile: Accepted for interface compatibility; not used.
        DensityIceProfile: Weight applied to the ice term.
        DensityDustProfile: Weight applied to the dust term.
        DensityProfile: Divisor applied to the weighted sum.

    Returns:
        ``(DensityIceProfile*ice + DensityDustProfile*dust) / DensityProfile``,
        computed element-wise.
    """
    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    delta = u - DustJ
    result_dust = DustF + DustG*delta + DustH*delta**2 + DustI*(delta**3)
    x = u / IceI
    result_ice = (x**3)*(IceC + IceD*(x**2) + IceE*(x**6)) / (1 + IceF*(x**2) + IceG*(x**4) + IceH*(x**8))
    return (DensityIceProfile*result_ice + DensityDustProfile*result_dust) / DensityProfile
#error_model='numpy' sets divison by 0 to NaN instead of throwing a exception, this allows vectorization
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    """Fused-loop version: one pass over the data, no intermediate arrays.

    Args:
        u: 1-D array the two terms are evaluated at.
        PorosityProfile: Accepted for interface compatibility; not used.
        DensityIceProfile: Per-element weight of the ice term.
        DensityDustProfile: Per-element weight of the dust term.
        DensityProfile: Per-element divisor of the weighted sum.

    Returns:
        A new array with the length and dtype of ``u``.
    """
    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    res = np.empty(u.shape[0], dtype=u.dtype)
    # Explicit index loop on purpose: the point of this version is to replace
    # the separate whole-array operations with a single pass.
    for i in range(u.shape[0]):
        delta = u[i] - DustJ
        result_dust = DustF + DustG*delta + DustH*delta**2 + DustI*(delta**3)
        x = u[i] / IceI
        result_ice = (x**3)*(IceC + IceD*(x**2) + IceE*(x**6)) / (1 + IceF*(x**2) + IceG*(x**4) + IceH*(x**8))
        res[i] = (DensityIceProfile[i]*result_ice + DensityDustProfile[i]*result_dust) / DensityProfile[i]
    return res
#there is obviously a bug in Numba (declaring const values in the function)
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH):
    """Parallel fused-loop version; the coefficients are passed in by the caller.

    Args:
        u: 1-D array the two terms are evaluated at.
        PorosityProfile: Accepted for interface compatibility; not used.
        DensityIceProfile: Per-element weight of the ice term.
        DensityDustProfile: Per-element weight of the dust term.
        DensityProfile: Per-element divisor of the weighted sum.
        DustJ, DustF, DustG, DustH, DustI: Offset and coefficients of the
            cubic dust polynomial.
        IceI, IceC, IceD, IceE, IceF, IceG, IceH: Scale and coefficients of
            the rational ice function.

    Returns:
        A new array with the length and dtype of ``u``.
    """
    res = np.empty((u.shape[0]), dtype=u.dtype)
    # Iterations are independent (each one reads and writes index i only).
    for i in nb.prange(u.shape[0]):
        delta = u[i] - DustJ
        result_dust = DustF + DustG*delta + DustH*delta**2 + DustI*(delta**3)
        x = u[i] / IceI
        result_ice = (x**3)*(IceC + IceD*(x**2) + IceE*(x**6)) / (1 + IceF*(x**2) + IceG*(x**4) + IceH*(x**8))
        res[i] = (DensityIceProfile[i]*result_ice + DensityDustProfile[i]*result_dust) / DensityProfile[i]
    return res
# Benchmark inputs: one million float32 samples per profile.
u = np.array(np.random.rand(1000000), dtype=np.float32)
PorosityProfile = np.array(np.random.rand(1000000), dtype=np.float32)
DensityIceProfile = np.array(np.random.rand(1000000), dtype=np.float32)
DensityDustProfile = np.array(np.random.rand(1000000), dtype=np.float32)
DensityProfile = np.array(np.random.rand(1000000), dtype=np.float32)
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
# Warm-up call, so compilation on first use is not part of the measurement.
res = improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile, DustJ, DustF, DustG, DustH, DustI, IceI, IceC, IceD, IceE, IceF, IceG, IceH)
# Start the clock only after the warm-up so that just the steady-state calls
# are timed.
t1 = time.perf_counter()
for _ in range(1000):
    res = improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile, DustJ, DustF, DustG, DustH, DustI, IceI, IceC, IceD, IceE, IceF, IceG, IceH)
# Total elapsed seconds for the 1000 calls.
print(time.perf_counter() - t1)
**性能**
Arraysize np.random.rand(100)
Numpy 46.8µs
naive Numba 3.1µs
improved Numba: 1.62µs
improved_Numba_p: 17.45µs
#Arraysize np.random.rand(1000000)
Numpy 255.8ms
naive Numba 18.6ms
improved Numba: 6.13ms
improved_Numba_p: 3.54ms
代码dtype = np.float32
如果np.float32足够,则必须将函数中的所有常量值显式声明为float32。否则Numba将使用float64。
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    """Fused-loop version with every constant declared as float32.

    Args:
        u: 1-D array the two terms are evaluated at.
        PorosityProfile: Accepted for interface compatibility; not used.
        DensityIceProfile: Per-element weight of the ice term.
        DensityDustProfile: Per-element weight of the dust term.
        DensityProfile: Per-element divisor of the weighted sum.

    Returns:
        A new array with the length and dtype of ``u``.
    """
    # Every constant is wrapped in nb.float32, including the literal 1 in the
    # denominator below, so none of them is typed as float64.
    DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
    IceI, IceC, IceD, IceE, IceF, IceG, IceH = nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2), nb.float32(6.4650e4), nb.float32(1.6935e6)
    res = np.empty(u.shape[0], dtype=u.dtype)
    for i in range(u.shape[0]):
        delta = u[i] - DustJ
        result_dust = DustF + DustG*delta + DustH*delta**2 + DustI*(delta**3)
        x = u[i] / IceI
        result_ice = (x**3)*(IceC + IceD*(x**2) + IceE*(x**6)) / (nb.float32(1) + IceF*(x**2) + IceG*(x**4) + IceH*(x**8))
        res[i] = (DensityIceProfile[i]*result_ice + DensityDustProfile[i]*result_dust) / DensityProfile[i]
    return res
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    """Parallel fused-loop version with every constant declared as float32.

    Args:
        u: 1-D array the two terms are evaluated at.
        PorosityProfile: Accepted for interface compatibility; not used.
        DensityIceProfile: Per-element weight of the ice term.
        DensityDustProfile: Per-element weight of the dust term.
        DensityProfile: Per-element divisor of the weighted sum.

    Returns:
        A new array with the length and dtype of ``u``.
    """
    res = np.empty((u.shape[0]), dtype=u.dtype)
    # Every constant is wrapped in nb.float32, including the literal 1 in the
    # denominator below, so none of them is typed as float64.
    DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
    IceI, IceC, IceD, IceE, IceF, IceG, IceH = nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2), nb.float32(6.4650e4), nb.float32(1.6935e6)
    # Iterations are independent (each one reads and writes index i only).
    for i in nb.prange(u.shape[0]):
        delta = u[i] - DustJ
        result_dust = DustF + DustG*delta + DustH*delta**2 + DustI*(delta**3)
        x = u[i] / IceI
        result_ice = (x**3)*(IceC + IceD*(x**2) + IceE*(x**6)) / (nb.float32(1) + IceF*(x**2) + IceG*(x**4) + IceH*(x**8))
        res[i] = (DensityIceProfile[i]*result_ice + DensityDustProfile[i]*result_dust) / DensityProfile[i]
    return res
**性能**
Arraysize np.random.rand(100).astype(np.float32)
Numpy 29.3µs
improved Numba: 1.33µs
improved_Numba_p: 18µs
Arraysize np.random.rand(1000000).astype(np.float32)
Numpy 117ms
improved Numba: 2.46ms
improved_Numba_p: 1.56ms
与@CodeSurgeon提供的Cython版本的比较并不公平,因为他编译该函数时没有启用AVX2和FMA3指令。Numba默认使用 -march=native 编译,这在我的Core i7-4xxx上会启用AVX2和FMA3指令。
不过,如果您想分发已编译的Cython版本的代码,不启用这些指令是有道理的,因为一旦启用了这些优化,它默认将无法在Haswell之前的处理器(或所有Pentium和Celeron)上运行。应该可以编译多个代码路径,但这取决于编译器,并且工作量更大。