考虑以下两种情况下经过优化的 Cython 代码:
# Serial baseline: build a running row sum for each of the 8 channels and add
# it to the value stored one row above in hgi_out.
for j in xrange(8):
    for x in xrange(1, 600):
        # Restart the running sum at the beginning of every row.
        tmp[j] = 0.0
        for y in xrange(1, 800):
            tmp[j] += mag[j, x - 1, y - 1]
            # Current cell = row prefix sum + the cell directly above it.
            hgi_out[j, x, y] = tmp[j] + hgi_out[j, x - 1, y]
和
# Same computation as the serial version, but the outer channel loop uses
# prange so that it is parallelized with OpenMP.
for j in prange(8):
    for x in xrange(1, 600):
        # Restart the running sum at the beginning of every row.
        # NOTE(review): every iteration of j writes its own tmp[j] slot, but the
        # slots are adjacent elements of one shared array -- presumably close
        # enough in memory for threads to contend on them; confirm by profiling.
        tmp[j] = 0.0
        for y in xrange(1, 800):
            tmp[j] += mag[j, x - 1, y - 1]
            # Current cell = row prefix sum + the cell directly above it.
            hgi_out[j, x, y] = tmp[j] + hgi_out[j, x - 1, y]
在这两种情况下,代码都位于声明为 nogil 的原生函数内,并使用内存视图和布局经过优化的 numpy 数组。第一种情况的基准测试(benchmark)运行时间为 14.97 毫秒,而第二种情况为 26.64 毫秒,大约翻了一倍!!
我还有其他函数,使用 prange 后在我的多核机器上性能大幅提升;唯独上述情况例外,我不明白发生了什么。
对于 prange 为什么会让这段代码变慢,大家有什么想法吗?
FWIW,这是完整的原始代码:
# cython: boundscheck=False
# cython: wraparound=False
# cython: nonecheck=False
# cython: overflowcheck.fold=True
# cython: embedsignature=False
# cython: cdivision=True
# cython: cdivision_warnings=False
# cython: always_allow_keywords=False
# cython: profile=False
# cython: linetrace=False
# cython: infer_types=False
# cython: language_level=2
# cython: c_string_type=unicode
# cython: c_string_encoding=utf-8
# cython: type_version_tag=True
# cython: unraisable_tracebacks=True
from __future__ import division
import numpy as np
cimport numpy as np
cimport cython
from cython.parallel import prange
# Each Python-level dtype object below is paired with the C-level typedef used
# for typed buffers / memoryviews of that element type.

# np.int was only a deprecated alias of the builtin int and has been removed
# from recent NumPy releases; np.int_ is the scalar type that np.int_t names.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t
UITYPE = np.uint
ctypedef np.uint_t UITYPE_t
U8TYPE = np.uint8
ctypedef np.uint8_t U8TYPE_t
F32TYPE = np.float32
ctypedef np.float32_t F32TYPE_t
F64TYPE = np.float64
ctypedef np.float64_t F64TYPE_t
# Index/size type used for loop counters and shapes.
ctypedef Py_ssize_t DSIZE_t
cdef void native_hog_integral_b(F64TYPE_t [:, :, ::1] mag,
                                F64TYPE_t [:, :, ::1] hgi_out) nogil:
    # Fill hgi_out[j, 1:, 1:] with the running 2-D sum of mag[j] for every
    # channel j along the first axis of mag.
    #
    # mag:     input values; axis 0 is iterated in parallel with prange.
    # hgi_out: output buffer; must be at least
    #          (mag.shape[0], mag.shape[1] + 1, mag.shape[2] + 1).
    #          Row 0 and column 0 are never written here, and row 0 is read
    #          when x == 1, so the caller must pre-initialise it (e.g. zeros).
    cdef DSIZE_t m, n, x, y, j, dims = mag.shape[0]
    # Scalar accumulator instead of a fixed `tmp[32]` array: the array
    # overflowed for more than 32 channels and kept every thread's running sum
    # in neighbouring memory.  A scalar assigned inside the prange body is
    # made thread-private by Cython.
    cdef F64TYPE_t tmp
    m, n = mag.shape[1] + 1, mag.shape[2] + 1
    for j in prange(dims):
        for x in xrange(1, m):
            # Restart the running sum at the beginning of every row.
            tmp = 0.0
            for y in xrange(1, n):
                # Spelled `tmp = tmp + ...` on purpose: `tmp += ...` inside
                # prange would make Cython treat tmp as a reduction variable,
                # which may not be read inside the loop body.
                tmp = tmp + mag[j, x - 1, y - 1]
                # Current cell = row prefix sum + the cell directly above it.
                hgi_out[j, x, y] = tmp + hgi_out[j, x - 1, y]
def hog_integral_b(mag, hgi_out=None, orientations=8):
    """Run native_hog_integral_b on ``mag`` and return the output buffer.

    Args:
        mag: 3-D float64, C-contiguous array; the native routine iterates
            over its first axis.
        hgi_out: Optional pre-allocated float64 output of shape
            ``(mag.shape[0], mag.shape[1] + 1, mag.shape[2] + 1)``.
            When ``None`` a zero-filled buffer of that shape is created.
        orientations: Kept for backward compatibility.  The default buffer is
            now sized from ``mag`` itself, because the native routine indexes
            ``hgi_out`` with ``mag``'s own extents.

    Returns:
        ``hgi_out`` (the array passed in, or the newly allocated one).
    """
    if hgi_out is None:
        # The native loop writes hgi_out[j, x, y] for j < mag.shape[0],
        # x <= mag.shape[1] and y <= mag.shape[2] with bounds checking turned
        # off, so the buffer must cover exactly those extents.  The previous
        # default used mag.shape[0] + 1 and mag.shape[1] + 1 for the last two
        # axes, which is too small for a 3-D ``mag``.
        channels, rows, cols = mag.shape[0], mag.shape[1], mag.shape[2]
        hgi_out = np.zeros((channels, rows + 1, cols + 1), dtype=F64TYPE)
    native_hog_integral_b(mag, hgi_out)
    return hgi_out
要测试上述代码,请尝试:
# Local import so the snippet is self-contained wherever it is pasted.
from timeit import timeit

# 9 channels of 600x800 random float64 input.
mg2 = np.random.rand(9, 600, 800).astype(F64TYPE)
# Output buffer with one extra row and column, allocated once and reused by
# every timed call.
hg2 = np.zeros((9, mg2.shape[1] + 1, mg2.shape[2] + 1), dtype=F64TYPE)
# Parenthesised print works as a statement on Python 2 and as a function call
# on Python 3; the value printed is the total time for 10 calls.
print(timeit(lambda: hog_integral_b(mg2, hgi_out=hg2), number=10))
更新:
好的,我仔细查看了我的setup.py编译器选项:
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
import numpy as np

# The OpenMP switch is handed to both the compiler and the linker.
OPENMP_FLAG = "-fopenmp"

# Flags currently in use.  The earlier variant additionally passed
# "-fno-strict-aliasing":
#     ["-O3", "-fopenmp", "-fno-strict-aliasing"]
COMPILE_FLAGS = ["-O3", OPENMP_FLAG]
LINK_FLAGS = [OPENMP_FLAG]

# Single extension module "hog_cy", built from hog_cy.pyx.
hog_extension = Extension(
    "hog_cy",
    ["hog_cy.pyx"],
    extra_compile_args=COMPILE_FLAGS,
    extra_link_args=LINK_FLAGS,
)
ext_modules = [hog_extension]

setup(
    name='performance test app',
    cmdclass={'build_ext': build_ext},
    # NumPy headers are needed because the .pyx file cimports numpy.
    include_dirs=[np.get_include()],
    ext_modules=ext_modules,
)
选项 -fno-strict-aliasing 似乎正是导致问题的原因:一旦去掉它,使用 prange 虽然没有带来加速,但也不再变慢。
答案 0 :(得分:2)
你引发了一场 GIL 争夺战,因为 prange 不在 nogil 区域内。你的代码中并没有真正的并发,只有多个线程在争夺 GIL 的所有权:
cimport cython
from cython.parallel cimport prange, parallel
cdef void native_hog_integral_b(F64TYPE_t [:, :, ::1] mag,
                                F64TYPE_t [:, :, ::1] hgi_out):
    # Fill hgi_out[j, 1:, 1:] with the running 2-D sum of mag[j] for every
    # channel j, with the channel loop run inside an explicit
    # `with nogil, parallel()` region.
    #
    # hgi_out must be at least
    # (mag.shape[0], mag.shape[1] + 1, mag.shape[2] + 1); row 0 is read but
    # never written here, so the caller must pre-initialise it.
    #
    # All cdef declarations live at function level: Cython rejects cdef
    # statements nested inside a `with` block.
    cdef DSIZE_t m, n, x, y, j, dims = mag.shape[0]
    cdef F64TYPE_t tmp
    m, n = mag.shape[1] + 1, mag.shape[2] + 1
    with nogil, parallel():
        for j in prange(dims):
            for x in range(1, m):
                # Restart the running sum at the beginning of every row; the
                # plain assignment inside prange makes tmp thread-private.
                tmp = 0.0
                for y in range(1, n):
                    # Spelled `tmp = tmp + ...` on purpose: `tmp += ...` would
                    # make Cython treat tmp as a reduction variable, which may
                    # not be read inside the loop body.
                    tmp = tmp + mag[j, x - 1, y - 1]
                    # Current cell = row prefix sum + the cell directly above.
                    hgi_out[j, x, y] = tmp + hgi_out[j, x - 1, y]
答案 1 :(得分:0)
cimport cython
from cython.parallel cimport prange, parallel
cdef void native_hog_integral_b(F64TYPE_t [:, :, ::1] mag,
                                F64TYPE_t [:, :, ::1] hgi_out):
    # Fill hgi_out[j, 1:, 1:] with the running 2-D sum of mag[j] for every
    # channel j, with the channel loop run inside an explicit
    # `with nogil, parallel()` region.
    #
    # hgi_out must be at least
    # (mag.shape[0], mag.shape[1] + 1, mag.shape[2] + 1); row 0 is read but
    # never written here, so the caller must pre-initialise it.
    #
    # All cdef declarations live at function level: Cython rejects cdef
    # statements nested inside a `with` block.
    cdef DSIZE_t m, n, x, y, j, dims = mag.shape[0]
    cdef F64TYPE_t tmp
    m, n = mag.shape[1] + 1, mag.shape[2] + 1
    with nogil, parallel():
        for j in prange(dims):
            for x in range(1, m):
                # Restart the running sum at the beginning of every row; the
                # plain assignment inside prange makes tmp thread-private.
                tmp = 0.0
                for y in range(1, n):
                    # Spelled `tmp = tmp + ...` on purpose: `tmp += ...` would
                    # make Cython treat tmp as a reduction variable, which may
                    # not be read inside the loop body.
                    tmp = tmp + mag[j, x - 1, y - 1]
                    # Current cell = row prefix sum + the cell directly above.
                    hgi_out[j, x, y] = tmp + hgi_out[j, x - 1, y]