Cython function takes more time than pure python

Date: 2018-05-16 22:17:37

Tags: python numpy cython

I am trying to speed up my code, and this part of it is giving me problems.

I tried to use Cython and followed the advice given here, but my pure python function performs better than both the cython and cython_optimized functions.

The cython code is the following:

import numpy as np
cimport numpy as np

DTYPE = np.float
ctypedef np.float_t DTYPE_t

cimport cython
@cython.boundscheck(False)
@cython.wraparound(False) 

def compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):

    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
    IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

    delta = u-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u/IceI;
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile


def compute_cythonOptimized(np.ndarray[DTYPE_t, ndim=1] u, np.ndarray[DTYPE_t, ndim=1] PorosityProfile, np.ndarray[DTYPE_t, ndim=1] DensityIceProfile, np.ndarray[DTYPE_t, ndim=1] DensityDustProfile, np.ndarray DensityProfile):

    assert u.dtype == DTYPE
    assert PorosityProfile.dtype == DTYPE
    assert DensityIceProfile.dtype == DTYPE
    assert DensityDustProfile.dtype == DTYPE
    assert DensityProfile.dtype == DTYPE

    cdef float DustJ = 250.0
    cdef float DustF = 633.0  
    cdef float DustG = 2.513 
    cdef float DustH = -2.2e-3   
    cdef float DustI = -2.8e-6 
    cdef float IceI =  273.16
    cdef float IceC =  1.843e5 
    cdef float IceD =  1.6357e8 
    cdef float IceE =  3.5519e9 
    cdef float IceF =  1.6670e2 
    cdef float IceG =  6.4650e4
    cdef float IceH =  1.6935e6

    cdef np.ndarray[DTYPE_t, ndim=1] delta = u-DustJ
    cdef np.ndarray[DTYPE_t, ndim=1] result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    cdef np.ndarray[DTYPE_t, ndim=1] x= u/IceI;
    cdef np.ndarray[DTYPE_t, ndim=1] result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

Then I run the following commands:

def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):

    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
    IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

    delta = u-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u/IceI;
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

import sublimation
import numpy as np

%timeit compute_python(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

%timeit compute_cython(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

%timeit compute_cythonOptimized(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

The results are the following:

For the pure python version: 68.9 µs ± 851 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

For the non-optimized cython version: 68.2 µs ± 685 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

For the optimized one: 72.7 µs ± 416 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

What am I doing wrong?

Thanks for your help,

2 answers:

Answer 0 (score: 3)

I generally agree with the advice presented by @chepner and @juanpa.arrivillaga in the comments. Numpy is a performant library, and it is true that the underlying calculations it performs are written in C. Furthermore, the syntax is clean, and it is trivial to apply scalar operations across all elements of a numpy array.
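As a quick illustration of that last point (a small sketch using the same constants as the question, not code from either post):

import numpy as np

u = np.random.rand(100)
delta = u - 250.0                  # one scalar subtracted from every element, no python-level loop
result_dust = 633.0 + 2.513*delta  # the whole expression is evaluated element-wise in C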

However, there actually is a way to significantly improve the performance of your code, thanks to the way your particular algorithm is structured, if we use the following assumptions (and can tolerate ugly code):

  • Your arrays are all one-dimensional, which makes iterating over each item of an array very simple. We do not need to replace harder numpy functions like numpy.dot, for example, as all operations in the code only combine scalars with matrices.
  • While using a for loop in python is unthinkable, iterating over each index is very feasible in cython. Additionally, each item in the final output depends only on the inputs that correspond to that item's index (i.e. the 0th item uses u[0], PorosityProfile[0], etc.).
  • You are not interested in any of the intermediate arrays, only in the final result returned by your compute_python function. So why waste time allocating memory for all of those intermediate numpy arrays?
  • Using the x**y syntax is surprisingly slow. I use the gcc compiler option -ffast-math to improve this significantly. I also use several cython compiler directives to avoid python checks and overhead (a short sketch of these follows this list).
  • Creating numpy arrays itself can carry python overhead, so I use a combination of typed memoryviews (the preferred, newer syntax) and malloc-ed pointers to create the output array without interacting with python much (only two lines, getting the output size and the return statement, show significant python interaction, as can be seen in the cython annotation file).
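For reference, a short sketch of those two points (not part of the original answer): the same directives that the setup.py below applies globally can also be set per file with a header comment, and the annotation file can be generated directly from the command line.

# cython: boundscheck=False, wraparound=False, initializedcheck=False
# cython: cdivision=True, nonecheck=False
# Generate the annotated HTML with:  cython -a sublimation.pyx
# Lines highlighted in yellow in the resulting sublimation.html indicate python interaction.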

Taking all of those considerations into account, here is the modified code. It performs nearly an order of magnitude faster than the naive python version on my laptop.

sublimation.pyx

from libc.stdlib cimport malloc, free

def compute_cython(float[:] u, float[:] porosity_profile, 
        float[:] density_ice_profile, float[:] density_dust_profile, 
        float[:] density_profile):    
    cdef:
        float dust_j, dust_f, dust_g, dust_h, dust_i
        float ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h
        int size, i
        float dt, result_dust, x, dust
        float result_ice_numer, result_ice_denom, result_ice, ice
        float* out

    dust_j, dust_f, dust_g, dust_h, dust_i = \
        250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h = \
        273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    size = len(u)
    out = <float *>malloc(size * sizeof(float))

    for i in range(size):
        dt = u[i] - dust_j
        result_dust = dust_f + (dust_g*dt) + (dust_h*dt**2) + (dust_i*dt**3)
        x = u[i] / ice_i
        result_ice_numer = x**3*(ice_c + ice_d*x**2 + ice_e*x**6)
        result_ice_denom = 1 + ice_f*x**2 + ice_g*x**4 + ice_h*x**8
        result_ice = result_ice_numer / result_ice_denom
        ice = density_ice_profile[i]*result_ice
        dust = density_dust_profile[i]*result_dust
        out[i] = (dust + ice)/density_profile[i]
    return <float[:size]>out

setup.py

from distutils.core import setup
from Cython.Build import cythonize
from distutils.core import Extension

def create_extension(ext_name):
    global language, libs, args, link_args
    path_parts = ext_name.split(".")
    path = "./{0}.pyx".format("/".join(path_parts))
    ext = Extension(ext_name, sources=[path], libraries=libs, language=language,
            extra_compile_args=args, extra_link_args=link_args)
    return ext

if __name__ == "__main__":
    libs = []#no external c libraries in this case
    language = "c"#chooses c rather than c++ since no c++ features were used
    args = ["-w", "-O3", "-ffast-math"]#assumes gcc is the compiler
    link_args = []#none here, could use -fopenmp for parallel code
    annotate = True#autogenerates .html files per .pyx
    directives = {#saves typing @cython decorators and applies them globally
        "boundscheck": False,
        "wraparound": False,
        "initializedcheck": False,
        "cdivision": True,
        "nonecheck": False,
    }

    ext_names = [
        "sublimation",
    ]

    extensions = [create_extension(ext_name) for ext_name in ext_names]
    setup(ext_modules = cythonize(
            extensions, 
            annotate=annotate, 
            compiler_directives=directives,
        )
    )
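Before main.py can import sublimation, the extension has to be built; a standard invocation (assumed here, since the original answer does not show the build step) is:

python3 setup.py build_ext --inplace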

main.py

import numpy as np
import sublimation as sub

def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
    IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6
    delta = u-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)
    x = u/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))
    return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

size = 100
u = np.random.rand(size).astype(np.float32)
porosity = np.random.rand(size).astype(np.float32)
ice = np.random.rand(size).astype(np.float32)
dust = np.random.rand(size).astype(np.float32)
density = np.random.rand(size).astype(np.float32)

"""
Run these from the terminal to test out the performance!
python3 -m timeit -s "from main import compute_python, u, porosity, ice, dust, density" "compute_python(u, porosity, ice, dust, density)"
python3 -m timeit -s "from main import sub, u, porosity, ice, dust, density" "sub.compute_cython(u, porosity, ice, dust, density)"
python3 -m timeit -s "import numpy as np; from main import sub, u, porosity, ice, dust, density" "np.asarray(sub.compute_cython(u, porosity, ice, dust, density))"

The first command tests the python version. (10000 loops, best of 3: 45.5 usec per loop)
The second command tests the cython version, but returns just a memoryview object. (100000 loops, best of 3: 4.63 usec per loop)
The third command tests the cython version, but converts the result to a ndarray (slower). (100000 loops, best of 3: 6.3 usec per loop)
"""

Let me know if any part of my explanation of how this answer works is unclear, and I hope it helps!

Update 1:

Unfortunately, I could not get MSYS2 and numba (which depends on LLVM) to play nicely with each other, so I could not make any direct comparisons. However, following @max9111's advice, I added -march=native to the args list in setup.py; the timings, however, were not significantly different from before.
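For clarity, that change only touches one line of the setup.py above (a trivial sketch of what was just described):

    args = ["-w", "-O3", "-ffast-math", "-march=native"]#assumes gcc is the compiler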

From this great answer, it seems that the automatic conversion between numpy arrays and typed memoryviews carries some overhead, which occurs both on the initial function call (and in the return statement as well, if you convert the result back). Reverting to a function signature like this:

ctypedef np.float32_t DTYPE_t
def compute_cython_np(
        np.ndarray[DTYPE_t, ndim=1] u, 
        np.ndarray[DTYPE_t, ndim=1] porosity_profile, 
        np.ndarray[DTYPE_t, ndim=1] density_ice_profile, 
        np.ndarray[DTYPE_t, ndim=1] density_dust_profile, 
        np.ndarray[DTYPE_t, ndim=1] density_profile):

saves about 1us per call, bringing it down to roughly 3.6us instead of 4.6us, which is somewhat significant, especially if the function is to be called many times. Of course, if you plan on calling this function many times, it might be more efficient to pass in 2D numpy arrays instead, saving a lot of the python function call overhead and amortizing the cost of the numpy array -> typed memoryview conversion. Additionally, it could be interesting to use numpy structured arrays, which can be converted in cython into typed memoryviews of structs, since this could put all of the data closer together in the cache and speed up memory access times (a small sketch of that idea follows below).
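A hedged sketch of that structured-array idea (the dtype layout and the names Profile/sum_u are illustrative assumptions, not something from this answer): a numpy structured array whose dtype matches a packed C struct can be passed to cython as a typed memoryview of that struct.

cimport numpy as np

cdef packed struct Profile:
    np.float32_t u
    np.float32_t porosity
    np.float32_t density_ice
    np.float32_t density_dust
    np.float32_t density

def sum_u(Profile[:] profiles):
    # trivial loop just to demonstrate field access; a real kernel would
    # evaluate the dust/ice polynomials here exactly as compute_cython does
    cdef Py_ssize_t i
    cdef float total = 0.0
    for i in range(profiles.shape[0]):
        total += profiles[i].u
    return total

On the python side, the matching array would be created with a structured dtype such as np.zeros(n, dtype=[("u", np.float32), ("porosity", np.float32), ("density_ice", np.float32), ("density_dust", np.float32), ("density", np.float32)]).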

As a final note, as promised in the earlier comments, here is a version that uses prange to take advantage of parallel processing. Note that this can only be used with typed memoryviews, since python's GIL must be released inside a prange loop (and the code has to be compiled with the -fopenmp flag added to args and link_args):

from cython.parallel import prange
from libc.stdlib cimport malloc, free

def compute_cython_p(float[:] u, float[:] porosity_profile, 
        float[:] density_ice_profile, float[:] density_dust_profile, 
        float[:] density_profile):
    cdef:
        float dust_j, dust_f, dust_g, dust_h, dust_i
        float ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h
        int size, i
        float dt, result_dust, x, dust
        float result_ice_numer, result_ice_denom, result_ice, ice
        float* out

    dust_j, dust_f, dust_g, dust_h, dust_i = \
        250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
    ice_i, ice_c, ice_d, ice_e, ice_f, ice_g, ice_h = \
        273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
    size = len(u)
    out = <float *>malloc(size * sizeof(float))

    for i in prange(size, nogil=True):
        dt = u[i] - dust_j
        result_dust = dust_f + (dust_g*dt) + (dust_h*dt**2) + (dust_i*dt**3)
        x = u[i] / ice_i
        result_ice_numer = x**3*(ice_c + ice_d*x**2 + ice_e*x**6)
        result_ice_denom = 1 + ice_f*x**2 + ice_g*x**4 + ice_h*x**8
        result_ice = result_ice_numer / result_ice_denom
        ice = density_ice_profile[i]*result_ice
        dust = density_dust_profile[i]*result_dust
        out[i] = (dust + ice)/density_profile[i]
    return <float[:size]>out

Update 2:

Following the very helpful additional suggestions provided by @max9111 in the comments, I switched all of the float[:] declarations in the code to float[::1]. The significance of this is that it lets the data be stored contiguously, so cython does not need to worry about a stride being present between the elements. This allows SIMD vectorization, which dramatically optimizes the code even further.

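For reference, the change only touches the signatures; for the serial version it looks roughly like this (a sketch of the changed declarations, not the full updated listing or timings from the original answer):

def compute_cython(float[::1] u, float[::1] porosity_profile, 
        float[::1] density_ice_profile, float[::1] density_dust_profile, 
        float[::1] density_profile):
    # body unchanged from sublimation.pyx above
    ...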

Answer 1 (score: 3)

A solution using Numba

CodeSurgeon has already given an excellent answer using Cython. In this answer I would like to show an alternative approach using Numba.

I created three versions. In naive_Numba I only added a function decorator. In improved_Numba I manually fused the loops (every vectorized command is actually a loop). In improved_Numba_p I parallelized the function. Please note that there is apparently a bug that does not allow declaring constant values inside the function when using the parallel accelerator. It should also be noted that the parallelized version is only beneficial for larger input arrays. You could also add a small wrapper that calls the single-threaded or the parallelized version depending on the size of the input array (a sketch of such a wrapper follows the float64 timings below).

Code (dtype=float64)

import numba as nb
import numpy as np
import time



@nb.njit(fastmath=True)
def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

  delta = u-DustJ
  result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

  x= u/IceI;
  result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

  return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

#error_model='numpy' sets division by 0 to NaN instead of throwing an exception, which allows vectorization
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6   
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6
  res=np.empty(u.shape[0],dtype=u.dtype)

  for i in range(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

#there is obviously a bug in Numba (declaring const values in the function)
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH):
  res=np.empty((u.shape[0]),dtype=u.dtype)

  for i in nb.prange(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

u=np.array(np.random.rand(1000000),dtype=np.float32)
PorosityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityIceProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityDustProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

#don't measure compilation overhead on first call
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH) 

t1=time.time()
for i in range(1000):
  res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)
print(time.time()-t1)

Performance

Arraysize np.random.rand(100)
Numpy             46.8µs
naive Numba       3.1µs
improved Numba:   1.62µs
improved_Numba_p: 17.45µs


Arraysize np.random.rand(1000000)
Numpy             255.8ms
naive Numba       18.6ms
improved Numba:   6.13ms
improved_Numba_p: 3.54ms
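A minimal sketch of the small wrapper mentioned above (the 100000-element threshold is an illustrative assumption, not a measured crossover; it relies on the module-level constants defined in the benchmark script):

def compute(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  #dispatch on input size: the parallel version only pays off for large arrays
  if u.shape[0] < 100000:
    return improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile)
  #the parallel version needs the constants passed in because of the Numba bug noted above
  return improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,
                          DustJ, DustF, DustG, DustH, DustI, IceI, IceC, IceD, IceE, IceF, IceG, IceH)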

Code (dtype=np.float32)

If np.float32 is sufficient, you have to explicitly declare all constant values inside the function as float32. Otherwise Numba will use float64.

@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2),  nb.float32(6.4650e4), nb.float32(1.6935e6)
  res=np.empty(u.shape[0],dtype=u.dtype)

  for i in range(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  res=np.empty((u.shape[0]),dtype=u.dtype)
  DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
  IceI, IceC, IceD, IceE, IceF, IceG, IceH =  nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2),  nb.float32(6.4650e4), nb.float32(1.6935e6)

  for i in nb.prange(u.shape[0]):
    delta = u[i]-DustJ
    result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

    x= u[i]/IceI
    result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

    res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

  return res

Performance

Arraysize np.random.rand(100).astype(np.float32)
Numpy             29.3µs
improved Numba:   1.33µs
improved_Numba_p: 18µs


Arraysize np.random.rand(1000000).astype(np.float32)
Numpy             117ms
improved Numba:   2.46ms
improved_Numba_p: 1.56ms

The comparison with the Cython version provided by @CodeSurgeon isn't fair, because he did not compile the function with AVX2 and FMA3 instructions enabled. Numba compiles with -march=native by default, which enables AVX2 and FMA3 instructions on my Core i7-4xxx.

However, this makes sense if you want to distribute a compiled Cython version of your code, because with those optimizations enabled it will not run by default on pre-Haswell processors (or on any Pentiums and Celerons). It should be possible to compile multiple code paths, but that is compiler dependent and more work.