Question

考虑以下两种情况下经过优化的 Cython 代码：

    # Serial variant: the outer loop over j is a plain xrange loop.
    for j in xrange(8):
        for x in xrange(1, 600):
            # Restart the running sum for row x of slice j.
            tmp[j] = 0.0
            for y in xrange(1, 800):
                # tmp[j] now holds the sum of mag[j, x - 1, 0:y].
                tmp[j] += mag[j, x - 1, y - 1]
                # Add the value stored directly above, in row x - 1.
                hgi_out[j, x, y] = tmp[j] + hgi_out[j, x - 1, y]

和

    # Parallel variant: identical loop body, only the outer loop over j uses prange.
    for j in prange(8):  # < prange used for parallelization with openmp
        for x in xrange(1, 600):
            # Restart the running sum for row x of slice j.
            tmp[j] = 0.0
            for y in xrange(1, 800):
                # tmp[j] now holds the sum of mag[j, x - 1, 0:y].
                tmp[j] += mag[j, x - 1, y - 1]
                # Add the value stored directly above, in row x - 1.
                hgi_out[j, x, y] = tmp[j] + hgi_out[j, x - 1, y]

在这两种情况下，代码都位于声明为 nogil 的原生函数内，并使用内存视图和布局经过优化的 numpy 数组。基准测试（benchmark）结果显示，第一种情况的运行时间为 14.97 毫秒，而第二种情况为 26.64 毫秒，大约翻了一倍！！

我还有其他函数，使用 prange 后在我的多核机器上性能得到了大幅提升；唯独上述情况例外，我不明白发生了什么。

关于为什么 prange 会降低代码速度的任何想法？

FWIW，这是完整的原始代码：

# cython: boundscheck=False
# cython: wraparound=False
# cython: nonecheck=False
# cython: overflowcheck.fold=True
# cython: embedsignature=False
# cython: cdivision=True
# cython: cdivision_warnings=False
# cython: always_allow_keywords=False
# cython: profile=False
# cython: linetrace=False
# cython: infer_types=False
# cython: language_level=2
# cython: c_string_type=unicode
# cython: c_string_encoding=utf-8
# cython: type_version_tag=True
# cython: unraisable_tracebacks=True
from __future__ import division
import numpy as np
cimport numpy as np
cimport cython
from cython.parallel import prange

# Each pair couples a NumPy scalar type (a runtime object, usable as
# ``dtype=...``) with the matching C-level typedef for ``cdef`` declarations.

# ``np.int`` was merely an alias of the builtin ``int``; it was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.  ``np.int_`` is the NumPy scalar type
# that goes with the ``np.int_t`` typedef below.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t
UITYPE = np.uint
ctypedef np.uint_t UITYPE_t
U8TYPE = np.uint8
ctypedef np.uint8_t U8TYPE_t
F32TYPE = np.float32
ctypedef np.float32_t F32TYPE_t
F64TYPE = np.float64
ctypedef np.float64_t F64TYPE_t
# Index/size type used for loop counters and array extents.
ctypedef Py_ssize_t DSIZE_t

# For every slice j of `mag`, fills `hgi_out` so that
#   hgi_out[j, x, y] = sum(mag[j, x - 1, 0:y]) + hgi_out[j, x - 1, y]
# for x in [1, mag.shape[1]] and y in [1, mag.shape[2]].
# `hgi_out` is indexed up to [mag.shape[0] - 1, mag.shape[1], mag.shape[2]],
# so it has to be at least that large. Row 0 is only read and column 0 is
# never touched by this function.
cdef void native_hog_integral_b(F64TYPE_t [:, :, ::1] mag,
                                F64TYPE_t [:, :, ::1] hgi_out) nogil:
    cdef DSIZE_t m, n, x, y, j, dims = mag.shape[0]
    # Per-slice running sums, one slot per value of j. The array has 32 slots,
    # so `dims` must not exceed 32 or tmp[j] indexes past its end.
    # NOTE(review): tmp is declared outside the prange loop and neighbouring
    # slots are written from different iterations; presumably those slots can
    # share a cache line between threads -- verify whether this contributes to
    # the slowdown described in the question.
    cdef F64TYPE_t [32] tmp
    # `val` is declared but not used anywhere in this function.
    cdef F64TYPE_t val = 0
    # Exclusive upper bounds of the x / y loops (output is shifted by one).
    m, n = mag.shape[1] + 1, mag.shape[2] + 1
    for j in prange(dims):
        for x in xrange(1, m):
            # Restart the running sum for row x of slice j.
            tmp[j] = 0.0
            for y in xrange(1, n):
                tmp[j] += mag[j, x - 1, y - 1]
                hgi_out[j, x, y] = tmp[j] + hgi_out[j, x - 1, y]

def hog_integral_b(mag, hgi_out=None, orientations=8):
    """Run ``native_hog_integral_b`` on ``mag`` and return the output buffer.

    Parameters
    ----------
    mag
        3-D C-contiguous float64 array indexed as (slice, row, column).
    hgi_out
        Optional 3-D C-contiguous float64 output array.  It must hold at
        least ``mag.shape[0]`` slices, ``mag.shape[1] + 1`` rows and
        ``mag.shape[2] + 1`` columns.  When ``None``, a zero-filled array
        of shape ``(orientations + 1, rows + 1, cols + 1)`` is allocated.
    orientations
        Only used to size the first axis of the default buffer;
        ``orientations + 1`` must be at least ``mag.shape[0]``.

    Returns
    -------
    The filled ``hgi_out`` array (the one passed in, or the new one).
    """
    if hgi_out is None:
        # ``mag`` is 3-D with the slice index on axis 0, so the spatial
        # extents are shape[1] and shape[2].  Sizing the buffer from
        # shape[0]/shape[1] makes it too small, and because bounds checking
        # is disabled for this module the native loop would then write
        # outside the array.
        rows, cols = mag.shape[1], mag.shape[2]
        hgi_out = np.zeros((orientations + 1, rows + 1, cols + 1), dtype=F64TYPE)
    native_hog_integral_b(mag, hgi_out)
    return hgi_out

要测试上述代码，请尝试：

# `timeit` was called below without being imported.
from timeit import timeit

# Assumes `np`, `F64TYPE` and `hog_integral_b` are already in scope.
# Input: 9 slices of 600 x 800 random float64 values.
mg2 = np.random.rand(9, 600, 800).astype(F64TYPE)
# Pre-allocated output, one row and one column larger than each input slice.
hg2 = np.zeros((9, mg2.shape[1] + 1, mg2.shape[2] + 1), dtype=F64TYPE)
# Prints the total time of 10 calls; the parenthesised form of print works
# under both Python 2 and Python 3.
print(timeit(lambda: hog_integral_b(mg2, hgi_out=hg2), number=10))

更新：

好的，我仔细查看了我的setup.py编译器选项：

from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
import numpy as np

# Flags handed to the compiler and the linker; "-fopenmp" is passed to both.
# An earlier variant of the compile flags (kept here for reference) also
# contained "-fno-strict-aliasing":
#     ["-O3", "-fopenmp", "-fno-strict-aliasing"]
compile_flags = ["-O3", "-fopenmp"]
link_flags = ["-fopenmp"]

# The single extension module, built from hog_cy.pyx.
hog_extension = Extension(
    "hog_cy",
    ["hog_cy.pyx"],
    extra_compile_args=compile_flags,
    extra_link_args=link_flags,
)
ext_modules = [hog_extension]

setup(
    name='performance test app',
    # Cython's build_ext command handles the .pyx source.
    cmdclass={'build_ext': build_ext},
    # The module cimports numpy, so the NumPy headers are on the include path.
    include_dirs=[np.get_include()],
    ext_modules=ext_modules,
)

选项 -fno-strict-aliasing 似乎正是造成问题的原因：一旦去掉它，虽然没有获得加速，但也不再变慢了。

Answer 1

你的代码引发了一场 GIL 争夺战，因为 prange 不在 nogil 块内。你的代码中并没有真正的并发，只有多个线程争抢 GIL 所有权：

cimport cython
from cython.parallel cimport prange, parallel

# For every slice j of `mag`, fills `hgi_out` so that
#   hgi_out[j, x, y] = sum(mag[j, x - 1, 0:y]) + hgi_out[j, x - 1, y]
# with the slices distributed over threads by prange.
# `hgi_out` needs at least mag.shape[0] slices, mag.shape[1] + 1 rows and
# mag.shape[2] + 1 columns; row 0 is only read and column 0 is never touched.
cdef void native_hog_integral_b(F64TYPE_t [:, :, ::1] mag,
                                F64TYPE_t [:, :, ::1] hgi_out):
    # All cdef declarations sit at function scope: Cython rejects cdef
    # statements nested inside a `with` block.
    cdef DSIZE_t m, n, x, y, j, dims = mag.shape[0]
    cdef F64TYPE_t tmp
    # Exclusive upper bounds of the x / y loops (output is shifted by one).
    m, n = mag.shape[1] + 1, mag.shape[2] + 1
    with nogil, parallel():
        for j in prange(dims):
            for x in range(1, m):
                # Restart the running sum for row x of slice j.
                tmp = 0.0
                for y in range(1, n):
                    # Deliberately a plain assignment: `tmp += ...` would make
                    # Cython treat tmp as a prange reduction variable, which
                    # may not be read inside the loop body.  Assigned this
                    # way, tmp is private to each thread.
                    tmp = tmp + mag[j, x - 1, y - 1]
                    hgi_out[j, x, y] = tmp + hgi_out[j, x - 1, y]

Answer 2

cimport cython
from cython.parallel cimport prange, parallel

# For every slice j of `mag`, fills `hgi_out` so that
#   hgi_out[j, x, y] = sum(mag[j, x - 1, 0:y]) + hgi_out[j, x - 1, y]
# with the slices distributed over threads by prange.
# `hgi_out` needs at least mag.shape[0] slices, mag.shape[1] + 1 rows and
# mag.shape[2] + 1 columns; row 0 is only read and column 0 is never touched.
cdef void native_hog_integral_b(F64TYPE_t [:, :, ::1] mag,
                                F64TYPE_t [:, :, ::1] hgi_out):
    # All cdef declarations sit at function scope: Cython rejects cdef
    # statements nested inside a `with` block.
    cdef DSIZE_t m, n, x, y, j, dims = mag.shape[0]
    cdef F64TYPE_t tmp
    # Exclusive upper bounds of the x / y loops (output is shifted by one).
    m, n = mag.shape[1] + 1, mag.shape[2] + 1
    with nogil, parallel():
        for j in prange(dims):
            for x in range(1, m):
                # Restart the running sum for row x of slice j.
                tmp = 0.0
                for y in range(1, n):
                    # Deliberately a plain assignment: `tmp += ...` would make
                    # Cython treat tmp as a prange reduction variable, which
                    # may not be read inside the loop body.  Assigned this
                    # way, tmp is private to each thread.
                    tmp = tmp + mag[j, x - 1, y - 1]
                    hgi_out[j, x, y] = tmp + hgi_out[j, x - 1, y]

Cython 0.2：prange意外地降低了代码速度

2 个答案: