I am getting the following error when clustering in Python 3:
ValueError Traceback (most recent call last)
<ipython-input-87-01d4fa194338> in <module>()
27 return thecentroids, clusteringA
28
---> 29 thecentroids, clusteringA = topterms(cmd, 5,100)
<ipython-input-87-01d4fa194338> in topterms(dataset, k, N)
1 def topterms(dataset, k,N):
----> 2 thecentroids, clusteringA = kMeans(mat(dataset), k,distMeas=cosine, createCent=randCent)
3 dfclusteringA = pd.DataFrame(clusteringA,index=dataset.index,copy=True)
4 dfthecentroids = pd.DataFrame(thecentroids)
5 for cluster in range(k):
C:\Users\farha\Anaconda3\lib\site-packages\numpy\matrixlib\defmatrix.py in asmatrix(data, dtype)
96
97 """
---> 98 return matrix(data, dtype=dtype, copy=False)
99
100 def matrix_power(M, n):
C:\Users\farha\Anaconda3\lib\site-packages\numpy\matrixlib\defmatrix.py in __new__(subtype, data, dtype, copy)
287 ret = N.ndarray.__new__(subtype, shape, arr.dtype,
288 buffer=arr,
--> 289 order=order)
290 return ret
291
ValueError: ndarray is not contiguous
Answer 0 (score: -1)
from numpy import *
import numpy as np
import pandas as pd

def distEclud(vecA, vecB):
    # Euclidean distance between two vectors
    return sqrt(sum(power(vecA - vecB, 2)))  # la.norm(vecA-vecB)
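# The listing relies on a `cosine` distance function that is never shown; the helper
# below is an assumption about what was intended (cosine distance = 1 - cosine similarity).
def cosine(vecA, vecB):
    a = np.asarray(vecA).ravel()
    b = np.asarray(vecB).ravel()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 1.0  # treat an all-zero vector as maximally distant
    return 1.0 - np.dot(a, b) / denom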
def randCent(dataSet, k):
    # Build k random centroids, each coordinate drawn uniformly from that column's range
    n = shape(dataSet)[1]
    centroids = zeros((k, n), dtype=float)
    for j in range(n):  # create random cluster centers
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = minJ + rangeJ * random.rand(k)
    return centroids

def kMeans(dataSet, k, distMeas=cosine, createCent=randCent):
    m = shape(dataSet)[0]
    clusterAssment = zeros((m, 2))  # create mat to assign data points to a centroid,
                                    # also holds the squared error of each point
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  # for each data point, assign it to the closest centroid
            minDist = inf; minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i, 0] != minIndex: clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2
        # print(centroids)
        for cent in range(k):  # recalculate centroids
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0] == cent)[0]]
            if len(ptsInClust) != 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def topterms(dataset, k, N):
    thecentroids, clusteringA = kMeans(mat(dataset), k, distMeas=cosine, createCent=randCent)
    dfclusteringA = pd.DataFrame(clusteringA, index=dataset.index, copy=True)
    dfthecentroids = pd.DataFrame(thecentroids)
    for cluster in range(k):
        print('Cluster', cluster)
        chosen = dfclusteringA[dfclusteringA[0] == cluster]
        print('Cluster capacity', len(chosen.index))
        if len(chosen.index) == 0:
            print('Empty Cluster')
            continue
        documents = dataset.loc[chosen.index]  # .ix is removed in recent pandas; .loc does the same label lookup here
        documentarray = np.array(documents)
        DF = np.array([(documentarray.T != 0).sum(1)]).T.flatten()  # document frequency of each term within the cluster
        totaldocuments = len(chosen.index)
        chos = pd.Series(DF)
        chos = chos.sort_values()
        chos = chos[:N]
        chosen_words = tdata.loc[chos.index]  # tdata is the term lookup table defined elsewhere in the notebook
        text = ''
        print('Descending terms')
        for j in range(len(chos.index)):
            print('Term:', chosen_words.iloc[j][0], 'cluster', chos.iloc[j],
                  'percentage of document', np.divide(chos.iloc[j], float(totaldocuments)))
            term = chosen_words.iloc[j][0]
            term_count = chos.iloc[j]
            text += ' '.join([term for word in range(term_count)]) + ' '
    return thecentroids, clusteringA

thecentroids, clusteringA = topterms(cmd, 5, 100)
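For reference, the traceback in the question comes from asmatrix() passing the array built from the DataFrame to ndarray.__new__ as a raw buffer, which requires contiguous memory; a DataFrame backed by a sliced or transposed array can easily fail that check. A common workaround is to hand kMeans a contiguous copy of the values up front. A minimal sketch, assuming cmd is an all-numeric DataFrame (the actual data is not shown in the question):

import numpy as np

# Force a C-contiguous float copy of the DataFrame's values before building the matrix;
# kMeans, cosine and randCent are the functions from the listing above.
contiguous = np.ascontiguousarray(cmd.values, dtype=float)
thecentroids, clusteringA = kMeans(mat(contiguous), 5, distMeas=cosine, createCent=randCent)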