Bakeoff 第 1 部分:Python vs Cython vs Cython 类型化内存视图 —— 基于 Gibbs 采样的 LDA

时间:2013-04-16 14:33:20

标签: cython montecarlo lda

更新:内存视图获胜。 使用类型化内存视图的Cython:0.0253449

特别感谢lothario指出了几个重大变化。

唉,当然现在的问题是,似乎不能对它们做很多算术运算(求和与乘法)。 原帖 灵感来自Implementing Topic Model with Python (numpy),速度非常慢。我认为对它进行cython化是个好主意。但是我只能弄清楚如何用cython将时间减半。这里有明显的数组操作没有被优化 - 任何想法和建议都将是最受欢迎的。我一直想玩cython,这似乎是一个很好的机会!

15个文件,每个约300字, python:39.6903322834 cython:19.2733114806 使用类型化内存视图的Cython:0.547822975

我特别想使用nogil,所以这可以进一步加速: 1)使用内存视图,是否有助于将nogil添加到循环中? 2)我有一个文档列表,每个文档由一组数字表示。什么是我使用的最佳C对象? nogil不适用于python对象。目前我把它作为数组列表。

我不是C恶魔,但欢迎任何进一步的优化建议。

来自朋友的Java实现,1000个文件,每个300字,3秒。

lda_pyx Cython代码

import numpy as np
cimport numpy as np
cimport cython
# np.int (an alias of the builtin int) was deprecated in NumPy 1.20 and
# removed in 1.24; np.int_ is the C-long scalar type matching np.int_t below.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t




cdef class LDA:
    """Count matrices for a collapsed-Gibbs LDA benchmark.

    n_k_w[k, t]   -- times word t is assigned to topic k
    n_m_k[m, k]   -- words of document m assigned to topic k
    n_k[k]        -- total words assigned to topic k
    k_m_n         -- per-document sequences of per-word topic assignments
    numbered_docs -- per-document int arrays of word ids
    docSizes      -- number of words in each document

    NOTE(review): the multinomial topic draw is stubbed out (new_k is
    hard-coded to 1), so this class only benchmarks the counter updates.
    """
    cdef int iteration, M
    cdef int[:] docSizes
    cdef double[:, ::1] n_k_w ,n_m_k
    cdef double[:] n_k
    cdef list k_m_n
    cdef list numbered_docs

    def __init__(self,int iteration,int M,  double[:, ::1] n_k_w ,double[:, ::1] n_m_k, double[:] n_k, int[:] docSizes, list numbered_docs, list k_m_n):
        self.iteration = iteration
        self.M = M
        self.n_k_w = n_k_w
        self.n_m_k = n_m_k
        self.n_k = n_k
        self.k_m_n = k_m_n
        self.numbered_docs = numbered_docs
        self.docSizes = docSizes

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _sample(self) :
        """Run one Gibbs sweep over every word of every document."""
        # Bind the attribute memoryviews to typed locals so the inner loop
        # indexes them without attribute lookups.
        cdef double[:, ::1] n_k_w = self.n_k_w
        cdef double[:] n_k = self.n_k
        cdef double[:, ::1]  n_m_k = self.n_m_k
        cdef int[:] docSizes = self.docSizes
        cdef  int m , n, t , k ,new_k
        cdef int[:] doc

        for m in xrange(self.M):
            doc = self.numbered_docs[m]
            for n in xrange(docSizes[m]):

                t = doc[n]

                # discount for n-th word t with its current topic k
                k = self.k_m_n[m][n]
                n_m_k[m,k] -= 1
                n_k_w[k,t] -= 1
                n_k[k] -= 1

                # placeholder topic draw; the real sampler would compute
                # p_k = n_k_w[:, t] * n_m_k[m, k] / n_k and draw from it
                new_k = 1

                # set the new topic and restore the counters
                self.k_m_n[m][n] = new_k
                n_m_k[m, new_k] += 1
                # FIX: single 2-D subscript [new_k, t]; the original
                # n_k_w[new_k][t] first materialises a 1-D memoryview slice
                # and then indexes it, which is far slower.
                n_k_w[new_k, t] += 1
                n_k[new_k] += 1

        # The locals alias the attribute buffers, so every update above is
        # already visible through self; the original copy-back assignments
        # (self.n_k_w = n_k_w, ...) were no-ops and have been removed.
        return 1

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _iterate(self) :
        """Run self.iteration sweeps, counting self.iteration down to 0."""
        while self.iteration >0 :
            self._sample()
            self.iteration -= 1
        return 1

def iterate(iteration, M,  n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n ):
    """Build an LDA sampler, run all Gibbs sweeps, and return its state.

    Returns the (n_k_w, n_m_k, n_k, k_m_n) counters held by the sampler
    after iteration sweeps.
    """
    # Declare and construct in one statement so Cython knows the static type.
    cdef LDA lda = LDA(iteration, M,  n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n)
    lda._iterate()
    return lda.n_k_w , lda.n_m_k, lda.n_k , lda.k_m_n

纯python版

def gibbs_sample():
    """Pure-Python Gibbs sweep benchmark over module-level state.

    Reads and updates the module globals: iteration, M, numbered_docs,
    docSizes, k_m_n, n_m_k, n_k_w, n_k.  The topic draw itself is a
    placeholder (new_k is fixed at 1).
    """
    for sweep in xrange(iteration):
        for m in xrange(M):
            words = numbered_docs[m]
            topics = k_m_n[m]
            for n in xrange(docSizes[m]):
                w = words[n]

                # withdraw the current assignment of the n-th word
                old_k = topics[n]
                n_m_k[m][old_k] -= 1
                n_k_w[old_k][w] -= 1
                n_k[old_k] -= 1

                # placeholder for the multinomial topic draw:
                # p_k = n_k_w[:, w] * n_m_k[m][old_k] / n_k
                new_k = 1
                # np.random.multinomial(1, p_z / p_z.sum()).argmax()

                # record the new topic and restore the counters
                topics[n] = new_k
                n_m_k[m][new_k] += 1
                n_k_w[new_k][w] += 1
                n_k[new_k] += 1

CPROFILE

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.419    0.419 <string>:1(<module>)
        1    0.419    0.419    0.419    0.419 {lda_pyx.iterate}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}

1 个答案:

答案 0 :(得分:1)

关于nogil:

使用 with nogil 只是允许其他线程在不持有全局解释器锁的情况下运行该代码块——您仍然需要自己在该块中运行多线程代码,并确保在执行此操作时不触摸任何 Python 对象。类型化的内存视图不是 Python 对象,因此您可以在多线程的 nogil 块中使用/操作它们。Cython 提供了 prange() 函数,它会在 with nogil 块内自动生成 OpenMP 指令。如果循环迭代彼此独立,则可以使用 prange 轻松获得良好的加速。这里有很多细节——请参阅链接文档。

关于您的代码:

专注于优化内循环中的代码。

在代码上使用cython -a会显示一些行可能会拖累您的效果。

  • 您可以直接索引到n_k_w[new_k,t]而不是您拥有的内容。

  • 通过将k_m_n列表转换为2D numpy数组,并在内部使用类型化的内存视图,您将获得改进。

  • 同样为numbered_docs

  • 每当您知道有连续数据时,您还需要使用arr[::1]类型的memoryview声明,否则Cython会将memview视为跨步,这将减慢访问速度。

请参阅下面的cython代码以获取一些建议 - 您可能需要触摸它以使其适合您的工作。

**lda.pyx**

import numpy as np
cimport numpy as np
cimport cython
# np.int (an alias of the builtin int) was deprecated in NumPy 1.20 and
# removed in 1.24; np.int_ is the C-long scalar type matching np.int_t below.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t


cdef class LDA:
    """Counter state for the collapsed-Gibbs LDA benchmark.

    All buffers are declared C-contiguous ([::1]) so the inner loop can
    index them at full speed.  The topic draw is a stub (new_k = 1).
    """

    cdef:
        int iteration, M
        int[::1] docSizes
        double[:, ::1] n_k_w ,n_m_k
        double[::1] n_k
        list k_m_n, numbered_docs

    def __init__(self, iteration, M, n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n):
        # Assigning array arguments to memoryview attributes grabs views,
        # not copies, so callers see every in-place update.
        self.iteration = iteration
        self.M = M
        self.docSizes = docSizes
        self.n_k_w = n_k_w
        self.n_m_k = n_m_k
        self.n_k = n_k
        self.numbered_docs = numbered_docs
        self.k_m_n = k_m_n

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _sample(self) :
        """One full Gibbs sweep over every word of every document."""
        cdef:
            double[:, ::1] topic_word = self.n_k_w
            double[:, ::1] doc_topic = self.n_m_k
            double[::1] topic_total = self.n_k
            int[::1] sizes = self.docSizes
            int[::1] topics, words
            int d, i, w, old_k, new_k

        for d in range(self.M):
            # Hoist the per-document Python lookups out of the word loop.
            topics = self.k_m_n[d]
            words = self.numbered_docs[d]
            for i in range(sizes[d]):
                w = words[i]
                old_k = topics[i]

                # remove the current assignment from all counters
                doc_topic[d, old_k] -= 1
                topic_word[old_k, w] -= 1
                topic_total[old_k] -= 1

                # stubbed topic draw
                new_k = 1

                # record the new topic and restore the counters
                topics[i] = new_k
                doc_topic[d, new_k] += 1
                topic_word[new_k, w] += 1
                topic_total[new_k] += 1

        return 1

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _iterate(self) :
        """Run sweeps until the iteration counter reaches zero."""
        while self.iteration >0 :
            self.iteration -= 1
            self._sample()
        return 1

def iterate(iteration, M,  n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n ):
    """Normalise all inputs to C-contiguous buffers and run the sampler.

    np.ascontiguousarray only copies when an input is not already a
    contiguous buffer of the requested dtype, so already-prepared arrays
    pass straight through.
    """
    n_k_w = np.ascontiguousarray(n_k_w, dtype=np.double)
    n_m_k = np.ascontiguousarray(n_m_k, dtype=np.double)
    n_k = np.ascontiguousarray(n_k, dtype=np.double)
    docSizes = np.ascontiguousarray(docSizes, dtype=np.int32)
    k_m_n = [np.ascontiguousarray(row, dtype=np.int32) for row in k_m_n]
    numbered_docs = [np.ascontiguousarray(row, dtype=np.int32) for row in numbered_docs]

    cdef LDA lda = LDA(iteration, M,  n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n)
    lda._iterate()
    # The LDA object only holds views of these arrays, so they have been
    # updated in place; return them directly.
    return n_k_w, n_m_k, n_k, k_m_n

**setup.py**

import numpy as np
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

# Single Cython extension; the NumPy include dir is required because the
# module does "cimport numpy".
setup(
    cmdclass={'build_ext': build_ext},
    ext_modules=[
        Extension(
            "lda",
            ["lda.pyx"],
            include_dirs=[np.get_include()],
        ),
    ],
)

**test.py**:

import numpy as np
from speedup import iterate

iteration = 10
M = 10
n_k_w = np.random.rand(10, 10)
n_m_k = np.random.rand(10, 10)
n_k = np.random.rand(10)
docSizes = np.zeros((10,), dtype=np.int32) + 10
numbered_docs = np.zeros((10, 10), dtype=np.int32) + 3
k_m_n = np.zeros((10, 10), dtype=np.int32) + 7
k_m_n_orig = k_m_n.copy()

iterate(iteration, M,  n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n)

print k_m_n_orig[1]
print k_m_n[1]