I am getting the following error when clustering in Python 3:
ValueError Traceback (most recent call last)
<ipython-input-87-01d4fa194338> in <module>()
27 return thecentroids, clusteringA
28
---> 29 thecentroids, clusteringA = topterms(cmd, 5,100)
<ipython-input-87-01d4fa194338> in topterms(dataset, k, N)
1 def topterms(dataset, k,N):
----> 2 thecentroids, clusteringA = kMeans(mat(dataset), k,distMeas=cosine, createCent=randCent)
3 dfclusteringA = pd.DataFrame(clusteringA,index=dataset.index,copy=True)
4 dfthecentroids = pd.DataFrame(thecentroids)
5 for cluster in range(k):
C:\Users\farha\Anaconda3\lib\site-packages\numpy\matrixlib\defmatrix.py in asmatrix(data, dtype)
96
97 """
---> 98 return matrix(data, dtype=dtype, copy=False)
99
100 def matrix_power(M, n):
C:\Users\farha\Anaconda3\lib\site-packages\numpy\matrixlib\defmatrix.py in __new__(subtype, data, dtype, copy)
287 ret = N.ndarray.__new__(subtype, shape, arr.dtype,
288 buffer=arr,
--> 289 order=order)
290 return ret
291
ValueError: ndarray is not contiguous
Answer 0 (score: -1)
from numpy import *
import numpy as np
import pandas as pd

def distEclud(vecA, vecB):
    # Euclidean distance between two vectors
    return sqrt(sum(power(vecA - vecB, 2)))  # la.norm(vecA-vecB)
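# The listing relies on a `cosine` distance function that is never shown; the helper
# below is an assumption about what was intended (cosine distance = 1 - cosine similarity).
def cosine(vecA, vecB):
    a = np.asarray(vecA).ravel()
    b = np.asarray(vecB).ravel()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 1.0  # treat an all-zero vector as maximally distant
    return 1.0 - np.dot(a, b) / denom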
def randCent(dataSet, k):
    # Build k random centroids, each coordinate drawn uniformly from that column's range
    n = shape(dataSet)[1]
    centroids = zeros((k, n), dtype=float)
    for j in range(n):  # create random cluster centers
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = minJ + rangeJ * random.rand(k)
    return centroids

def kMeans(dataSet, k, distMeas=cosine, createCent=randCent):
    m = shape(dataSet)[0]
    clusterAssment = zeros((m, 2))  # create mat to assign data points to a centroid,
                                    # also holds the squared error of each point
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  # for each data point, assign it to the closest centroid
            minDist = inf; minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI; minIndex = j
            if clusterAssment[i, 0] != minIndex: clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2
        # print(centroids)
        for cent in range(k):  # recalculate centroids
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0] == cent)[0]]
            if len(ptsInClust) != 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def topterms(dataset, k, N):
    thecentroids, clusteringA = kMeans(mat(dataset), k, distMeas=cosine, createCent=randCent)
    dfclusteringA = pd.DataFrame(clusteringA, index=dataset.index, copy=True)
    dfthecentroids = pd.DataFrame(thecentroids)
    for cluster in range(k):
        print('Cluster', cluster)
        chosen = dfclusteringA[dfclusteringA[0] == cluster]
        print('Cluster capacity', len(chosen.index))
        if len(chosen.index) == 0:
            print('Empty Cluster')
            continue
        documents = dataset.loc[chosen.index]  # .ix is removed in recent pandas; .loc does the same label lookup here
        documentarray = np.array(documents)
        DF = np.array([(documentarray.T != 0).sum(1)]).T.flatten()  # document frequency of each term within the cluster
        totaldocuments = len(chosen.index)
        chos = pd.Series(DF)
        chos = chos.sort_values()
        chos = chos[:N]
        chosen_words = tdata.loc[chos.index]  # tdata is the term lookup table defined elsewhere in the notebook
        text = ''
        print('Descending terms')
        for j in range(len(chos.index)):
            print('Term:', chosen_words.iloc[j][0], 'cluster', chos.iloc[j],
                  'percentage of document', np.divide(chos.iloc[j], float(totaldocuments)))
            term = chosen_words.iloc[j][0]
            term_count = chos.iloc[j]
            text += ' '.join([term for word in range(term_count)]) + ' '
    return thecentroids, clusteringA

thecentroids, clusteringA = topterms(cmd, 5, 100)
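For reference, the traceback in the question comes from asmatrix() passing the array built from the DataFrame to ndarray.__new__ as a raw buffer, which requires contiguous memory; a DataFrame backed by a sliced or transposed array can easily fail that check. A common workaround is to hand kMeans a contiguous copy of the values up front. A minimal sketch, assuming cmd is an all-numeric DataFrame (the actual data is not shown in the question):

import numpy as np

# Force a C-contiguous float copy of the DataFrame's values before building the matrix;
# kMeans, cosine and randCent are the functions from the listing above.
contiguous = np.ascontiguousarray(cmd.values, dtype=float)
thecentroids, clusteringA = kMeans(mat(contiguous), 5, distMeas=cosine, createCent=randCent)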