更新:内存视图获胜。 使用类型化内存视图的Cython:0.0253449
特别感谢lothario指出了几个重大变化。
效果惊人。当然,现在的问题是,似乎无法对内存视图做太多算术运算(加法和乘法)。 原帖:灵感来自《Implementing Topic Model with Python (numpy)》,但那个实现速度非常慢。我认为把它改写成Cython是个好主意,但我只弄清楚了如何用Cython把运行时间减半。这里显然还有一些数组操作没有被优化——非常欢迎大家提出想法和建议。我一直想试试Cython,这似乎是一个很好的机会!
15个文档,每个约300词:python:39.6903322834,cython:19.2733114806,使用类型化内存视图的Cython:0.547822975
我特别想使用nogil,所以这可以进一步加速: 1)使用内存视图,是否有助于将nogil添加到循环中? 2)我有一个文档列表,每个文档由一组数字表示。什么是我使用的最佳C对象? nogil不适用于python对象。目前我把它作为数组列表。
我不是C语言高手,但欢迎任何进一步的优化建议。
作为对比:朋友的Java实现处理1000个文档(每个300词)只需3秒。
lda_pyx Cython代码
import numpy as np
cimport numpy as np
cimport cython
# Runtime dtype object kept next to the compile-time DTYPE_t ctypedef.
# The alias `np.int` was removed in NumPy 1.24 and raises AttributeError
# at import time there; `np.int_` is the NumPy scalar type to use instead.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t
cdef class LDA:
    """Holds the count buffers and assignments for repeated sampling sweeps.

    The count arrays are stored as typed memoryviews, so every update made
    while sampling is written straight into the buffers handed to
    ``__init__``; no copy is made by this class.
    """

    cdef int iteration, M
    cdef int[:] docSizes
    cdef double[:, ::1] n_k_w, n_m_k
    cdef double[:] n_k
    cdef list k_m_n
    cdef list numbered_docs

    def __init__(self, int iteration, int M, double[:, ::1] n_k_w,
                 double[:, ::1] n_m_k, double[:] n_k, int[:] docSizes,
                 list numbered_docs, list k_m_n):
        """Store the sampler state.

        iteration     -- number of sweeps ``_iterate`` will run.
        M             -- number of documents visited per sweep.
        n_k_w         -- 2-D counts, indexed ``[topic, word id]`` in ``_sample``.
        n_m_k         -- 2-D counts, indexed ``[document, topic]`` in ``_sample``.
        n_k           -- 1-D counts, indexed ``[topic]`` in ``_sample``.
        docSizes      -- number of positions to visit in each document.
        numbered_docs -- list with one C-int buffer of word ids per document.
        k_m_n         -- list with one indexable sequence per document holding
                         the topic currently assigned to each position.
        """
        self.iteration = iteration
        self.M = M
        self.n_k_w = n_k_w
        self.n_m_k = n_m_k
        self.n_k = n_k
        self.k_m_n = k_m_n
        self.numbered_docs = numbered_docs
        self.docSizes = docSizes

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _sample(self):
        """Run one sweep over every position of every document; returns 1.

        Bounds checking is disabled, so ``M``, ``docSizes`` and all stored
        indices must be consistent with the buffer shapes.
        """
        # Local views of the attribute buffers; they alias the same memory,
        # so nothing has to be written back to ``self`` afterwards.
        cdef double[:, ::1] n_k_w = self.n_k_w
        cdef double[:, ::1] n_m_k = self.n_m_k
        cdef double[:] n_k = self.n_k
        cdef int[:] docSizes = self.docSizes
        cdef int[:] doc
        cdef int m, n, t, k, new_k

        # `range` with a typed C index is compiled to a plain C loop.
        for m in range(self.M):
            doc = self.numbered_docs[m]
            # Fetch this document's assignment sequence once per document
            # instead of once per word.
            doc_topics = self.k_m_n[m]
            for n in range(docSizes[m]):
                t = doc[n]
                # Discount the n-th word t with its current topic k.
                k = doc_topics[n]
                n_m_k[m, k] -= 1
                n_k_w[k, t] -= 1
                n_k[k] -= 1
                # The sampling step is stubbed out: the new topic is always 1.
                new_k = 1
                # Record the new topic and increment the counters. A single
                # [i, j] subscript is used everywhere; chained [i][j] would
                # build an intermediate slice on every access.
                doc_topics[n] = new_k
                n_m_k[m, new_k] += 1
                n_k_w[new_k, t] += 1
                n_k[new_k] += 1
        return 1

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _iterate(self):
        """Call ``_sample`` until the ``iteration`` counter reaches zero; returns 1."""
        while self.iteration > 0:
            self._sample()
            self.iteration -= 1
        return 1
def iterate(iteration, M, n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n):
    """Build an ``LDA`` sampler from the arguments and run all its sweeps.

    Returns the sampler's ``n_k_w``, ``n_m_k``, ``n_k`` and ``k_m_n``
    attributes as a 4-tuple.
    """
    # Typing the local lets Cython reach the cdef method and attributes.
    cdef LDA sampler = LDA(iteration, M, n_k_w, n_m_k, n_k, docSizes,
                           numbered_docs, k_m_n)
    sampler._iterate()
    return sampler.n_k_w, sampler.n_m_k, sampler.n_k, sampler.k_m_n
纯python版
def gibbs_sample():
    """Run ``iteration`` sweeps over all documents, updating counts in place.

    Takes no arguments: ``iteration``, ``M``, ``numbered_docs``,
    ``docSizes``, ``k_m_n``, ``n_m_k``, ``n_k_w`` and ``n_k`` are read from
    the enclosing module scope, and the last four are mutated.
    """
    for _sweep in xrange(iteration):
        for m in xrange(M):
            words = numbered_docs[m]
            for pos in xrange(docSizes[m]):
                word = words[pos]

                # Remove this word's current topic from every counter.
                old_topic = k_m_n[m][pos]
                n_m_k[m][old_topic] -= 1
                n_k_w[old_topic][word] -= 1
                n_k[old_topic] -= 1

                # Sampling is stubbed out; the new topic is always 1.
                new_topic = 1

                # Store the new topic and add it back to every counter.
                k_m_n[m][pos] = new_topic
                n_m_k[m][new_topic] += 1
                n_k_w[new_topic][word] += 1
                n_k[new_topic] += 1
CPROFILE
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 0.419 0.419 <string>:1(<module>)
1 0.419 0.419 0.419 0.419 {lda_pyx.iterate}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
答案 0 :(得分:1)
关于nogil:
使用 `with nogil` 只是让该代码块在不持有全局解释器锁(GIL)的情况下运行,从而允许其他线程同时执行——您仍然需要自己在该块中运行多线程代码,并确保此时不触碰任何Python对象。类型化内存视图不是Python对象,因此可以在多线程的 nogil 块中使用和操作它们。Cython 提供了 `prange()` 函数,它会在 `with nogil` 块内自动生成OpenMP指令。如果各次循环迭代彼此独立,使用 `prange` 可以轻松获得良好的加速。这里有很多细节——请参阅链接的文档。
关于您的代码:
专注于优化内循环中的代码。
对代码运行 `cython -a`,会标出可能拖慢性能的那些行。
请直接用 `n_k_w[new_k, t]` 进行索引,而不要写成链式的 `n_k_w[new_k][t]`。
把 `k_m_n` 列表转换为二维numpy数组,并在内部使用类型化内存视图,可以进一步提升性能;`numbered_docs` 也是如此。
只要您知道数据是连续的,就应该使用 `arr[::1]` 形式的内存视图声明,否则Cython会把该内存视图当作跨步(strided)的,从而减慢访问速度。
请参阅下面的Cython代码获取一些建议——您可能需要对其稍作调整,才能适用于您的场景。
**lda.pyx**
import numpy as np
cimport numpy as np
cimport cython
# Runtime dtype object kept next to the compile-time DTYPE_t ctypedef.
# The alias `np.int` was removed in NumPy 1.24 and raises AttributeError
# at import time there; `np.int_` is the NumPy scalar type to use instead.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t
cdef class LDA:
    """Sampler state backed by C-contiguous typed memoryviews.

    The memoryview attributes are views of the arrays passed to
    ``__init__``, so sampling modifies those arrays in place.
    """

    cdef int iteration, M
    cdef int[::1] docSizes
    cdef double[:, ::1] n_k_w, n_m_k
    cdef double[::1] n_k
    cdef list k_m_n, numbered_docs

    def __init__(self, iteration, M, n_k_w, n_m_k, n_k, docSizes,
                 numbered_docs, k_m_n):
        """Store the sweep count, document count, count buffers and lists.

        Each argument is coerced to the declared C type of the attribute it
        is assigned to; the array arguments must therefore expose
        C-contiguous buffers of the matching element type.
        """
        self.iteration = iteration
        self.M = M
        self.n_k_w = n_k_w
        self.n_m_k = n_m_k
        self.n_k = n_k
        self.k_m_n = k_m_n
        self.numbered_docs = numbered_docs
        self.docSizes = docSizes

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _sample(self):
        """Perform one sweep over every position of every document; returns 1.

        Bounds checks are off, so sizes and stored indices must be valid.
        """
        cdef int[::1] sizes = self.docSizes
        cdef double[:, ::1] topic_word = self.n_k_w
        cdef double[:, ::1] doc_topic = self.n_m_k
        cdef double[::1] topic_total = self.n_k
        cdef int[::1] topics
        cdef int[::1] words
        cdef int doc_idx, pos, word, old_topic, new_topic

        for doc_idx in range(self.M):
            # Per-document buffers: current topic of each position, word ids.
            topics = self.k_m_n[doc_idx]
            words = self.numbered_docs[doc_idx]
            for pos in range(sizes[doc_idx]):
                word = words[pos]
                old_topic = topics[pos]

                # Take the current assignment out of all three counters.
                doc_topic[doc_idx, old_topic] -= 1
                topic_word[old_topic, word] -= 1
                topic_total[old_topic] -= 1

                # Sampling is stubbed out; the new topic is always 1.
                new_topic = 1

                # Store the new topic and put it back into the counters.
                topics[pos] = new_topic
                doc_topic[doc_idx, new_topic] += 1
                topic_word[new_topic, word] += 1
                topic_total[new_topic] += 1
        return 1

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _iterate(self):
        """Run ``_sample`` while ``iteration`` is positive, decrementing it; returns 1."""
        while self.iteration > 0:
            self._sample()
            self.iteration -= 1
        return 1
def iterate(iteration, M, n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n):
    """Run ``iteration`` sampling sweeps and return the updated buffers.

    Every array-like argument is passed through ``np.ascontiguousarray``
    with the element type the ``LDA`` memoryviews expect. That call returns
    its input unchanged when it is already a C-contiguous array of that
    dtype and makes a converted copy otherwise, so callers should use the
    returned objects rather than rely on their inputs being modified.

    Returns ``(n_k_w, n_m_k, n_k, k_m_n)``, where ``k_m_n`` is a list with
    one integer array per document.

    Raises ``ValueError`` if ``M`` or an entry of ``docSizes`` exceeds the
    data actually supplied; the sampling loop runs without bounds checks,
    so such input would otherwise read and write out of bounds.
    """
    cdef LDA lda

    as_c_array = np.ascontiguousarray
    n_k_w = as_c_array(n_k_w, dtype=np.double)
    n_m_k = as_c_array(n_m_k, dtype=np.double)
    n_k = as_c_array(n_k, dtype=np.double)
    # np.intc is NumPy's name for the C `int` type, which is the element
    # type of the int[::1] memoryviews; a fixed-width np.int32 only matches
    # on platforms where C int happens to be 32 bits wide.
    docSizes = as_c_array(docSizes, dtype=np.intc)
    k_m_n = [as_c_array(k_n, dtype=np.intc) for k_n in k_m_n]
    numbered_docs = [as_c_array(n_d, dtype=np.intc) for n_d in numbered_docs]

    # Validate sizes up front because _sample disables bounds checking.
    if M > min(len(docSizes), len(numbered_docs), len(k_m_n)):
        raise ValueError(
            "M exceeds the number of entries in docSizes, numbered_docs or k_m_n")
    for doc_index in range(M):
        doc_size = docSizes[doc_index]
        if (doc_size > len(numbered_docs[doc_index])
                or doc_size > len(k_m_n[doc_index])):
            raise ValueError(
                "docSizes[%d] exceeds the length of that document's data"
                % doc_index)

    lda = LDA(iteration, M, n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n)
    lda._iterate()
    # The LDA object only holds views of these arrays, so they already
    # contain the updated values and can be returned directly.
    return n_k_w, n_m_k, n_k, k_m_n
**setup.py**
import numpy as np
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
# Single extension module "lda", compiled from lda.pyx. The NumPy header
# directory is added to the include path for the `cimport numpy` in the .pyx.
lda_extension = Extension(
    "lda",
    sources=["lda.pyx"],
    include_dirs=[np.get_include()],
)

setup(
    # Cython's build_ext command handles the .pyx source.
    cmdclass={"build_ext": build_ext},
    ext_modules=[lda_extension],
)
**test.py:**
"""Smoke test: run the compiled sampler on small synthetic inputs."""
import numpy as np

# setup.py builds the extension under the module name "lda", so import it
# from there.
from lda import iterate

iteration = 10
M = 10

# 10 x 10 count tables and a length-10 total vector filled with random values.
n_k_w = np.random.rand(10, 10)
n_m_k = np.random.rand(10, 10)
n_k = np.random.rand(10)

# Ten documents of ten positions each: every word id is 3 and every
# position starts out assigned to topic 7.
docSizes = np.zeros((10,), dtype=np.int32) + 10
numbered_docs = np.zeros((10, 10), dtype=np.int32) + 3
k_m_n = np.zeros((10, 10), dtype=np.int32) + 7

# Keep a copy so the assignments can be compared before and after the run.
k_m_n_orig = k_m_n.copy()

iterate(iteration, M, n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n)

# Call-form print gives the same output on Python 2 and Python 3 for a
# single argument.
print(k_m_n_orig[1])
print(k_m_n[1])