我在python中实现了一种名为naiveKmeans的算法。但是太长了。
我试图将python代码转换为cython代码。我还键入了变量以加快算法。但是它仍然存在了很久。 Cython不能加速我的算法。
from scipy.spatial.distance import euclidean
import numpy as np
cimport numpy as np
DTYPE = np.int
ctypedef np.int_t DTYPE_t
import time
cdef class NaiveKmeans:
cdef public np.ndarray data
cdef public np.ndarray assignment
cdef public np.ndarray clusterSize
cdef public np.ndarray centerMovement
cdef public np.ndarray centers
cdef public np.ndarray sumNewCenters
cdef public int n
cdef public int k
cdef public int d
def __init__(self,DTYPE_t kk,DTYPE_t nn, np.ndarray X, np.ndarray cent):
self.n = nn
self.k = kk
self.d = X.shape[1]
self.data = X
self.centers = cent
#For each point in x, keep which cluster it is assigned to. By using a
#short, we assume a limited number of clusters (fewer than 2^16).
self.assignment = np.full((self.n),-1,dtype=float)
# To communicate (to all threads) that we have converged.
#bool converged;
#Keep track of how many points are in each cluster, divided over each
#thread.
self.clusterSize = np.zeros(self.k)
#centerMovement is computed in move_centers() and used to detect
#convergence (if max(centerMovement) == 0.0) and update point-center
#distance bounds (in subclasses that use them).
#self.centerMovement = None
""" sumNewCenters and centerCount provide sufficient statistics to
// quickly calculate the changing locations of the centers. Whenever a
// point changes cluster membership, we subtract (add) it from (to) the
// row in sumNewCenters associated with its old (new) cluster. We also
// decrement (increment) centerCount for the old (new) cluster."""
self.sumNewCenters = np.zeros((self.k,self.d),dtype=float)
def changeAssignment(self,xIndex,closestCluster):
oldAssignment = self.assignment[xIndex]
self.clusterSize[self.assignment[xIndex]] -= 1
self.clusterSize[closestCluster] += 1
self.assignment[xIndex] = closestCluster
xp = self.data[xIndex]
self.sumNewCenters[oldAssignment] = np.subtract(self.sumNewCenters[oldAssignment],xp)
self.sumNewCenters[closestCluster] = np.add(self.sumNewCenters[closestCluster],xp)
def move_centers(self):
cdef int furthestMovingCenter = 0
self.centerMovement = np.zeros(self.k,dtype=float)
for j in range(self.k):
if (self.clusterSize[j] > 0):
z = np.divide(self.sumNewCenters[j],self.clusterSize[j])
#self.old_centers[j] = z
#self.centerMovement[j] = np.sqrt(np.square( np.subtract(z,self.centers[j]) ))
self.centerMovement[j] = euclidean(z,self.centers[j])
self.centers[j] = z
#self.centerMovement[j] = np.sqrt(self.centerMovement[j])
#print("shape fur naiv",self.centerMovement[furthestMovingCenter].shape,self.centerMovement[furthestMovingCenter])
print("cem",self.centerMovement,self.centerMovement[2])
if (self.centerMovement[furthestMovingCenter] < self.centerMovement[j]):
furthestMovingCenter = j
return furthestMovingCenter
def runThread(self,maxIterations):
#track the number of iterations the algorithm performs
iterations = 0
converged = False
while ((iterations < maxIterations) and (not converged)):
#print("ce nv",self.centers)
st = time.time()
iterations+=1
#loop over all examples
for i in range(self.n):
# look for the closest center to this example
closest = 0
closestDist2 = np.inf
for j in range(self.k):
d2 = euclidean(self.data[i], self.centers[j]);
if (d2 < closestDist2):
closest = j
closestDist2 = d2
if (self.assignment[i] != closest):
self.changeAssignment(i, closest)
print("temps iter cython nv",time.time()-st)
furthestMovingCenter = self.move_centers()
converged = (0.0 == self.centerMovement[furthestMovingCenter])
#print(converged,converged.all())
return iterations
from distutils.core import setup, Extension
from Cython.Build import cythonize
import numpy
setup(
ext_modules=cythonize(Extension('nv', ["nv.pyx"], include_dirs=[numpy.get_include()]))
)
from sklearn.datasets.samples_generator import make_classification
from nv import NaiveKmeans
k=5
n=40000
(data,y) = make_classification(n_samples=n,n_features=2, n_redundant=0,n_clusters_per_class=1)
# define initial centroids (points obtained from data)
centers = _k_init(X=data, n_clusters=k)
NV=NaiveKmeans(k,n,data,centers)
start = time.time()
print(NV.runThread(100))
print("time NV",time.time()-start)
该算法在cython中的每次迭代大约需要2.5秒,这对于仅包含40000个二维点的数据集来说太长了。它花费的时间与纯python中的时间相同。我不知道cython为何无法加快程序速度。