Question

我正在尝试使用 cython.parallel 的 prange，但只看到两个核心在工作，而且各自只用到 50%。怎样才能用上所有核心？也就是说，把循环同时分发到各个核心，并让它们共享数组 volume 和 mc_vol？

编辑：我还补充了一个纯顺序的 for 循环版本，它比 cython.parallel prange 版本快约 30 秒。两个版本都只用到一个核心。有没有办法把它并行化？

cimport cython
from cython.parallel import prange, parallel, threadid
from libc.stdio cimport sprintf
from libc.stdlib cimport malloc, free
cimport numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef MC_Surface(np.ndarray[np.int_t,ndim=3] volume, np.ndarray[np.float32_t,ndim=3] mc_vol):
    """Fill ``mc_vol`` with a per-cell area value looked up from ``volume``.

    Sequential version. For every cell ``(i, j, k)`` the eight corner values
    of ``volume`` are formatted into one string, that string is used as the
    key into ``Perm_area``, and the result is stored in ``mc_vol[i, j, k]``.

    Parameters
    ----------
    volume : 3-D integer array that is read at ``index`` and ``index + 1``
        along every axis.
    mc_vol : 3-D float32 array that is written in place.

    Returns
    -------
    The same ``mc_vol`` object that was passed in.

    Raises
    ------
    MemoryError if the scratch buffer cannot be allocated.
    KeyError if a formatted pattern is not a key of ``Perm_area``.
    """
    # Each cell reads index + 1, so there is one cell fewer than samples.
    # NOTE(review): the first-axis length is used for all three loops, so
    # ``volume`` is presumably cubic -- confirm against the caller.
    cdef int vol_len = len(volume) - 1
    cdef int k, j, i
    cdef char* pattern  # scratch buffer for the lookup key, allocated below
    Perm_area = {
        "00000000": 0.000000,
        ...
        "00011101": 1.515500
    }

    pattern = <char*>malloc(sizeof(char)*260)
    if pattern == NULL:
        # sprintf into a NULL buffer would crash; fail loudly instead.
        raise MemoryError()
    try:
        for k in range(vol_len):
            for j in range(vol_len):
                for i in range(vol_len):
                    # Build the key from the 8 corners of the cell.
                    sprintf(pattern, "%i%i%i%i%i%i%i%i",
                            volume[i, j, k],
                            volume[i, j + 1, k],
                            volume[i + 1, j, k],
                            volume[i + 1, j + 1, k],
                            volume[i, j, k + 1],
                            volume[i, j + 1, k + 1],
                            volume[i + 1, j, k + 1],
                            volume[i + 1, j + 1, k + 1]);

                    # NOTE(review): ``pattern`` is converted to a Python
                    # object for this lookup; whether it matches the string
                    # keys above depends on the Python / language level in
                    # use -- confirm.
                    mc_vol[i, j, k] = Perm_area[pattern]
                    # if Perm_area[pattern] > 0:
                    #    print pattern, 'Area: ', Perm_area[pattern]
                    # total_area += Perm_area[pattern]
    finally:
        # Always release the scratch buffer, even if the lookup raised.
        free(pattern)
    return mc_vol

编辑：我按照 DavidW 的建议做了修改，但 prange 版本反而慢了不少：

 cpdef MC_Surface(np.ndarray[np.int_t,ndim=3] volume, np.ndarray[np.float32_t,ndim=3] mc_vol):
     """Fill ``mc_vol`` with a per-cell area value, splitting ``k`` across threads.

     ``prange`` version. The key string is built with ``sprintf`` without the
     GIL, but the ``Perm_area`` lookup is a Python dict access and therefore
     runs inside ``with gil:``. Only one thread can hold the GIL at a time,
     so the lookups are serialised across threads.

     Parameters
     ----------
     volume : 3-D integer array that is read at ``index`` and ``index + 1``
         along every axis.
     mc_vol : 3-D float32 array that is written in place.

     Returns
     -------
     The same ``mc_vol`` object that was passed in.
     """
     # Each cell reads index + 1, so there is one cell fewer than samples.
     # NOTE(review): the first-axis length is used for all three loops, so
     # ``volume`` is presumably cubic -- confirm against the caller.
     cdef int vol_len = len(volume) - 1
     cdef int k, j, i
     cdef char* pattern  # scratch buffer for the lookup key, allocated below
     Perm_area = {
         "00000000": 0.000000,
         ...
         "00011101": 1.515500
     }

     with nogil, parallel():
         try:
             # Assigned inside the parallel block, so every thread
             # allocates (and later frees) its own buffer.
             pattern = <char*>malloc(sizeof(char)*260)
             for k in prange(vol_len):
                 for j in range(vol_len):
                     for i in range(vol_len):
                         # Build the key from the 8 corners of the cell.
                         sprintf(pattern, "%i%i%i%i%i%i%i%i",
                                 volume[i, j, k],
                                 volume[i, j + 1, k],
                                 volume[i + 1, j, k],
                                 volume[i + 1, j + 1, k],
                                 volume[i, j, k + 1],
                                 volume[i, j + 1, k + 1],
                                 volume[i + 1, j, k + 1],
                                 volume[i + 1, j + 1, k + 1]);
                         # The dict lookup needs the GIL; this is the part
                         # that keeps the loop from running in parallel.
                         with gil:
                             mc_vol[i, j, k] = Perm_area[pattern]
                             # if Perm_area[pattern] > 0:
                             #    print pattern, 'Area: ', Perm_area[pattern]
                             #    total_area += Perm_area[pattern]
         finally:
             free(pattern)

     return mc_vol

我的设置文件如下：

# Cython extension built from c_marchSurf.pyx. It is compiled as C++ with
# OpenMP switched on for both the compile and the link step.
c_march_surf_extension = Extension(
    'c_marchSurf',
    ['c_marchSurf.pyx'],
    include_dirs=[numpy.get_include()],
    extra_compile_args=['-fopenmp'],
    extra_link_args=['-fopenmp'],
    language="c++",
)

setup(
    name='SurfaceArea',
    ext_modules=[c_march_surf_extension],
    cmdclass={'build_ext': build_ext},
    requires=[
        'Cython',
        'numpy',
        'matplotlib',
        'pathos',
        'scipy',
        'cython.parallel',
    ],
)

Answer 1

问题出在 `with gil:` 上：它定义的代码块在同一时刻只能在一个核心上运行。而你的循环里除此之外几乎没有别的工作，所以不应该指望有任何加速。

为了避免使用 GIL，你需要尽量避免使用 Python 的功能。字符串格式化部分可以改用 C 的 sprintf 来创建字符串，从而不需要 GIL。至于字典查找部分，最简单的办法大概是使用 C++ 标准库，其中的 map 类具有类似的行为。（注意：现在需要用 Cython 的 C++ 模式来编译。）

# at the top of your file
from libc.stdio cimport sprintf
from libc.stdlib cimport malloc, free
from libcpp.map cimport map
from libcpp.string cimport string
import numpy as np
cimport numpy as np

# ... code omitted  ....
# Skeleton of a GIL-free MC_Surface: the key is still built with sprintf, but
# the lookup goes through a C++ std::map instead of a Python dict, so no
# "with gil:" block is needed. The loop headers are intentionally omitted.
cpdef MC_Surface(np.ndarray[np.int_t,ndim=3] volume, np.ndarray[np.float32_t,ndim=3] mc_vol):
    # note above I've defined volume as a numpy array so that
    # I can do fast, GIL-less direct array lookup
    cdef char* pattern # a string pointer - allocate later

    Perm_area = {} # some dictionary, as before

    # depending on the size of Perm_area, this conversion to
    # a C++ object is potentially quite slow (it involves a lot
    # of string copies)
    cdef map[string,float] Perm_area_m = Perm_area

    # ... code omitted ...
    with nogil,parallel():
       try:
         # assigning pattern here makes it thread local
         # it's assigned once per thread which isn't too bad
         pattern = <char*>malloc(sizeof(char)*50)
         # when you allocate pattern you need to make it big enough
         # either by calculating a size, or by just making it overly big
         # NOTE(review): 50 bytes is ample for eight single-digit values plus
         # the terminating NUL; this presumes volume only holds small values
         # such as 0/1 -- confirm.

         # ... more code omitted...
           # then later, inside your loops
           sprintf(pattern, "%i%i%i%i%i%i%i%i", volume[i, j, k],
                        volume[i, j + 1, k],
                        volume[i + 1, j, k],
                        volume[i + 1, j + 1, k],
                        volume[i, j, k + 1],
                        volume[i, j + 1, k + 1],
                        volume[i + 1, j, k + 1],
                        volume[i + 1, j + 1, k + 1]);
           # and now do the dictionary lookup without the GIL
           # because we're using the C++ class instead.
           # Unfortunately, we also need to do a string copy (which might slow things down)
           mc_vol[i, j, k] = Perm_area_m[string(pattern)]
           # NOTE: unlike a Python dict, std::map::operator[] does NOT raise
           # when the pattern is missing: it inserts a default-constructed
           # value (0.0) under that key and returns it. That insertion
           # modifies the map shared by all threads, so an unmatched pattern
           # is also a data race here. Use find()/count() instead if
           # unmatched patterns can occur.
       finally:
         # runs once per thread and releases that thread's buffer
         free(pattern)

我还必须把 volume 改成 numpy 数组，因为如果它只是一个普通的 Python 对象，对它的元素做索引就需要 GIL。

（编辑：改为使用 C++ 的 map，从而把字典查找移出了 GIL 块）

cython.parallel看不出速度上的差异

1 个答案: