为什么cython不能加快我的python代码？

时间：2019-04-30 11:31:41

标签： python python-3.x performance cython cythonize

我在python中实现了一种名为naiveKmeans的算法。但是太长了。

我试图将python代码转换为cython代码。我还键入了变量以加快算法。但是它仍然存在了很久。 Cython不能加速我的算法。

实现naiveKmeans算法的cython代码

from scipy.spatial.distance import euclidean
import numpy as np
cimport numpy as np
DTYPE = np.int
ctypedef np.int_t DTYPE_t


import time

cdef class NaiveKmeans:
    cdef public np.ndarray data
    cdef public np.ndarray assignment
    cdef public np.ndarray clusterSize
    cdef public np.ndarray centerMovement
    cdef public np.ndarray centers
    cdef public np.ndarray sumNewCenters
    cdef public int n
    cdef public int k
    cdef public int d
    def __init__(self,DTYPE_t kk,DTYPE_t nn, np.ndarray X, np.ndarray cent):
        self.n = nn
        self.k = kk
        self.d = X.shape[1]
        self.data = X
        self.centers = cent
        #For each point in x, keep which cluster it is assigned to. By using a
        #short, we assume a limited number of clusters (fewer than 2^16).
        self.assignment = np.full((self.n),-1,dtype=float)
        # To communicate (to all threads) that we have converged.
        #bool converged;
        #Keep track of how many points are in each cluster, divided over each
        #thread.
        self.clusterSize = np.zeros(self.k)
        #centerMovement is computed in move_centers() and used to detect
        #convergence (if max(centerMovement) == 0.0) and update point-center
        #distance bounds (in subclasses that use them).
        #self.centerMovement = None

        """ sumNewCenters and centerCount provide sufficient statistics to
        // quickly calculate the changing locations of the centers. Whenever a
        // point changes cluster membership, we subtract (add) it from (to) the
        // row in sumNewCenters associated with its old (new) cluster. We also
        // decrement (increment) centerCount for the old (new) cluster."""
        self.sumNewCenters = np.zeros((self.k,self.d),dtype=float)

    def changeAssignment(self,xIndex,closestCluster):
        oldAssignment = self.assignment[xIndex]
        self.clusterSize[self.assignment[xIndex]] -= 1
        self.clusterSize[closestCluster] += 1
        self.assignment[xIndex] = closestCluster
        xp = self.data[xIndex]
        self.sumNewCenters[oldAssignment] = np.subtract(self.sumNewCenters[oldAssignment],xp) 
        self.sumNewCenters[closestCluster] = np.add(self.sumNewCenters[closestCluster],xp)

    def move_centers(self):
        cdef int furthestMovingCenter = 0
        self.centerMovement = np.zeros(self.k,dtype=float)
        for j in range(self.k):
            if (self.clusterSize[j] > 0):
                    z = np.divide(self.sumNewCenters[j],self.clusterSize[j])
                    #self.old_centers[j] = z
                    #self.centerMovement[j] = np.sqrt(np.square( np.subtract(z,self.centers[j]) ))
                    self.centerMovement[j] = euclidean(z,self.centers[j]) 
                    self.centers[j] = z
                    #self.centerMovement[j] = np.sqrt(self.centerMovement[j])
            #print("shape fur naiv",self.centerMovement[furthestMovingCenter].shape,self.centerMovement[furthestMovingCenter])
            print("cem",self.centerMovement,self.centerMovement[2])
            if (self.centerMovement[furthestMovingCenter] < self.centerMovement[j]):
                furthestMovingCenter = j
        return furthestMovingCenter

    def runThread(self,maxIterations):
        #track the number of iterations the algorithm performs
        iterations = 0
        converged = False
        while ((iterations < maxIterations) and (not converged)):
            #print("ce nv",self.centers)
            st = time.time()
            iterations+=1
            #loop over all examples
            for i in range(self.n):
                # look for the closest center to this example
                closest = 0
                closestDist2 = np.inf
                for j in range(self.k):
                    d2 = euclidean(self.data[i], self.centers[j]);
                    if (d2 < closestDist2):
                        closest = j
                        closestDist2 = d2
                if (self.assignment[i] != closest):
                    self.changeAssignment(i, closest)
            print("temps iter cython nv",time.time()-st)
            furthestMovingCenter = self.move_centers()
            converged = (0.0 == self.centerMovement[furthestMovingCenter])
            #print(converged,converged.all())


        return iterations

设置

from distutils.core import setup, Extension
from Cython.Build import cythonize
import numpy 

setup(
        ext_modules=cythonize(Extension('nv', ["nv.pyx"], include_dirs=[numpy.get_include()]))

)

主程序python调用NaiveKmeans

from sklearn.datasets.samples_generator import make_classification
from nv import NaiveKmeans
k=5
n=40000

(data,y) = make_classification(n_samples=n,n_features=2, n_redundant=0,n_clusters_per_class=1)


# define initial centroids (points obtained from data) 
centers = _k_init(X=data, n_clusters=k)


NV=NaiveKmeans(k,n,data,centers)
start = time.time()
print(NV.runThread(100))
print("time NV",time.time()-start)

该算法在cython中的每次迭代大约需要2.5秒，这对于仅包含40000个二维点的数据集来说太长了。它花费的时间与纯python中的时间相同。我不知道cython为何无法加快程序速度。

0 个答案:

没有答案