Question

我试图在cython中实现一个简单的滚动平均值。我在非常大的数据集中运行它作为算法模拟的一部分，因此使用pandas.rolling等不是一种选择。

然而，我遇到了一个非常令人恼火的情况：Cython 对类方法（class method）的优化效果很差。以下是代码。

class RollingAverage:
    """Running mean over a fixed-length circular window of float32 values.

    The mean is kept in ``ma`` and adjusted incrementally on every update
    rather than being recomputed from the whole buffer.  The buffer starts
    out filled with zeros and ``ma`` starts at 0, and every update divides by
    the full window length, so until ``length`` values have been pushed the
    initial zeros still count towards the reported mean.
    """
    def __init__(self, length):
        """Create an all-zero window with ``length`` slots and a mean of 0."""
        # Number of updates performed so far; ``current % window_length``
        # selects the slot that the next update overwrites.
        self.current = 0
        # Current value of the rolling mean.
        self.ma = 0
        self.window_length = length
        # Circular buffer holding the most recent values (float32).
        self.window = np.zeros(length, dtype=np.float32)

    def mean(self):
        """Return the current rolling mean without modifying any state."""
        return self.ma

    # Cython directives: disable index bounds checking and negative-index
    # wraparound for indexing performed inside this method.
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def update(self, value):
        """Push ``value`` into the window and return the updated mean.

        The slot at ``current % window_length`` holds the value written
        ``window_length`` updates ago (or the initial zero); its contribution
        is removed from the mean and replaced by that of ``value``.
        """
        # Incremental update: new_mean = old_mean + (incoming - outgoing) / n.
        # NOTE(review): the accompanying .pxd declares ``window`` as a plain
        # ``np.ndarray`` with no element type, and the annotated C output for
        # this line shows the element read going through a generic
        # Python-object item lookup plus PyNumber_* arithmetic.
        self.ma += (value - self.window[self.current % self.window_length]) / self.window_length
        # Overwrite the outgoing slot with the incoming value.
        self.window[self.current % self.window_length] = value
        self.current += 1
        return self.ma

    def update2(self,value):
        """Same contract as ``update``, with the arithmetic done by ``__update_impl``.

        The helper computes the new mean and writes ``value`` into the window
        in place; this method then stores the returned mean and advances the
        update counter.
        """
        # NOTE(review): in plain Python a ``__name`` identifier used inside a
        # class body is subject to private-name mangling; confirm that this
        # resolves to the module-level helper under the Cython version in use.
        self.ma = __update_impl( self.ma,
                                 self.current,
                                 self.window_length,
                                 self.window,
                                 value )
        self.current += 1
        return self.ma

# Cython directives: disable index bounds checking and negative-index
# wraparound for indexing performed inside this function.
@cython.boundscheck(False)
@cython.wraparound(False)
def __update_impl(ma, current, window_length, window, value):
    """Apply one rolling-mean step and return the new mean.

    Parameters:
        ma: the mean before this step.
        current: update counter; ``current % window_length`` picks the slot.
        window_length: number of slots in ``window``; also the divisor.
        window: circular buffer of recent values; modified in place.
        value: the incoming value.

    Returns:
        The mean after replacing the selected slot's value with ``value``.

    The counter is not advanced here; the caller is responsible for that.
    """
    # new_mean = old_mean + (incoming - outgoing) / n
    ma += (value - window[current % window_length]) / window_length
    # Store the incoming value in the slot that was just accounted for.
    window[current % window_length] = value
    return ma

我还有一个pxd文件，其中定义了以下内容：

# C-level declarations for the RollingAverage class: attribute types and
# cpdef method signatures.
cdef class RollingAverage:
    # Update counter; taken modulo window_length to select the buffer slot.
    cdef int current
    # Current rolling mean, stored in single precision.
    cdef float ma
    # Number of slots in the window.
    cdef int window_length
    # Window buffer. Declared as a plain ndarray with no element type,
    # unlike the np.ndarray[float] parameter of __update_impl.
    cdef np.ndarray window
    # cpdef: callable from Python and from C.
    cpdef update(self, float value)
    cpdef update2(self, float value)
    cpdef mean(self)


# C-level signature of the rolling-mean helper. Here `window` is declared as
# a float-typed ndarray buffer; the annotated C output for this function shows
# the element being read through a direct strided float-pointer access.
cdef float __update_impl(float ma,
                  int current,
                  int window_length,
                  np.ndarray[float] window,
                  float value)

编辑：以下是 update 和 __update_impl 中热点代码的 cython 注释（annotate）输出对比：

def update(self, value):

+0127:      self.ma += (value - self.window[self.current % self.window_length]) / self.window_length

  __pyx_t_1 = PyFloat_FromDouble(__pyx_v_self->ma); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 127, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_2 = PyFloat_FromDouble(__pyx_v_value); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 127, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  if (unlikely(__pyx_v_self->window_length == 0)) {
    PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
    __PYX_ERR(0, 127, __pyx_L1_error)
  }
  __pyx_t_8 = __Pyx_mod_int(__pyx_v_self->current, __pyx_v_self->window_length);
  __pyx_t_4 = __Pyx_GetItemInt(((PyObject *)__pyx_v_self->window), __pyx_t_8, int, 1, __Pyx_PyInt_From_int, 0, 0, 0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 127, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_4);
  __pyx_t_6 = PyNumber_Subtract(__pyx_t_2, __pyx_t_4); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 127, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_6);
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
  __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_self->window_length); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 127, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_4);
  __pyx_t_2 = __Pyx_PyNumber_Divide(__pyx_t_6, __pyx_t_4); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 127, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
  __pyx_t_4 = PyNumber_InPlaceAdd(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 127, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_4);
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __pyx_t_7 = __pyx_PyFloat_AsFloat(__pyx_t_4); if (unlikely((__pyx_t_7 == (float)-1) && PyErr_Occurred())) __PYX_ERR(0, 127, __pyx_L1_error)
  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
  __pyx_v_self->ma = __pyx_t_7;

+0129:      self.current += 1
+0130:      return self.ma


def __update_impl(ma, current, window_length, window, value):

+0146:  ma += (value - window[current % window_length]) / window_length

  if (unlikely(__pyx_v_window_length == 0)) {
    PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
    __PYX_ERR(0, 146, __pyx_L1_error)
  }
  __pyx_t_1 = __Pyx_mod_int(__pyx_v_current, __pyx_v_window_length);
  __pyx_t_2 = (__pyx_v_value - (*__Pyx_BufPtrStrided1d(float *, __pyx_pybuffernd_window.rcbuffer->pybuffer.buf, __pyx_t_1, __pyx_pybuffernd_window.diminfo[0].strides)));
  if (unlikely(__pyx_v_window_length == 0)) {
    PyErr_SetString(PyExc_ZeroDivisionError, "float division");
    __PYX_ERR(0, 146, __pyx_L1_error)
  }
  __pyx_v_ma = (__pyx_v_ma + (__pyx_t_2 / ((float)__pyx_v_window_length)));

+0147:  window[current % window_length] = value
+0148:  return ma

方法update几乎比update2慢一个数量级。

%%timeit
ma.update(1000)
The slowest run took 11.68 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 3 µs per loop

%%timeit
ma.update2(1000)
The slowest run took 22.90 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 416 ns per loop

此外，当我对源代码运行 "cython -a" 时，方法中出现了大量黄色标注（表示与 Python 的交互），而函数中没有一丝黄色，几乎完全编译成了 C 代码。

我还试图用我在cython中声明的局部变量替换所有self.变量，但无济于事。出于某种原因，在方法中使用代码会强制进行大量测试和转换。

我错过了什么？

Answer 1

关键区别在于：快速版本中数组的类型是 ndarray[float]，而慢速版本中只是 ndarray。在不知道数组元素数据类型的情况下，Cython 无法对元素访问进行任何真正的优化。

你之所以不得不这样写，是因为完全类型化的 ndarray 不允许作为类成员（fully typed ndarrays aren't allowed as class members）。最好的解决方案是改用类型化的内存视图（typed memoryview）float[:]，其用法在很大程度上是相同的。（如果您确实需要访问底层的 ndarray 以调用它的某个方法，可以通过 memoryview 的 .base 属性获取它。）

与函数相比，类方法的cython优化令人沮丧

1 个答案: