我的代码-
import xlrd,statistics,numpy
loc= r"Re.xlsx"
wb = xlrd.open_workbook(loc)
sheet = wb.sheet_by_index(0);
x=[]
def get_col_index(u):
for j in range(sheet.ncols):
if sheet.cell_value(0, j)== u:
return j
def get_model_list():
p=[]
f= get_col_index('bikeModel')
print(f)
for j in range(sheet.nrows):
c= sheet.cell_value(j,f)
if c not in p:
if c!='':
p.append(c)
return p
def averag(x):
bd= sum(x)
ct= len(x)
av= bd/ct
return av
def k_means(x):
serviceType_len= len(x)
if len(x)<=2:
print([numpy.average(x), numpy.std(x)])
return [numpy.average(x), numpy.std(x)]
#arrange data list and stored in 'y'
y=sorted(x)
mini=min(y)
maxi=max(y)
ran=maxi-mini
#get initial centroids c1, c2, c3, c4 in C
q1=[]
q2=[]
q3=[]
q4=[]
for i in y:
if i<= (ran/4+mini):
q1.append(i)
elif i<=(mini+ran/2):
q2.append(i)
elif i<=(mini+3*ran/4):
q3.append(i)
else:
q4.append(i)
Cd=[]
if len(q1)!=0:
q1.sort()
c1=[]
Cd.append(q1[0])
else:
c1='NULL'
if len(q2)!=0:
q2.sort()
c2=[]
Cd.append(q2[0])
else:
c2='NULL'
if len(q3)!=0:
q3.sort()
c3=[]
Cd.append(q3[0])
else:
c3='NULL'
if len(q4)!=0:
q4.sort()
c4=[]
Cd.append(q4[0])
else:
c4='NULL'
print(Cd)
cl=[]
rty= cluster(y,Cd,cl)
return rty
def cluster(y,Cd,cl):
if len(y)<=2:
return numpy.average(y)
else:
ncl=[]
ocl=cl
ncl1=[]
ncl2=[]
ncl.append(ncl1)
ncl.append(ncl2)
if len(Cd)>2:
ncl3=[]
ncl.append(ncl3)
if len(Cd)>3:
ncl4=[]
ncl.append(ncl4)
pc=0
for i in y:
print(Cd[0])
diff=abs(i-Cd[0])
for k in Cd:
if abs(i-k)<diff:
diff= abs(i-k)
pc=Cd.index(k)
ncl[pc].append(i)
print("appended")
if len(Cd)>=1:
Cd[0]= averag(ncl[0])
print(Cd[0])
if len(Cd)>=2:
Cd[1]= averag(ncl2)
if len(Cd)>2:
Cd[2]= averag(ncl3)
if len(Cd)>3:
Cd[3]=averag(ncl4)
if ocl==ncl:
ncl.sort(key=len)
ncl.reverse()
ncl[0].sort()
ncl[1].sort()
if len(Cd)>2:
ncl[2].sort()
if len(Cd)>3:
ncl[3].sort()
ll=[ncl[0]]
for we in ncl:
if we!=ncl[0]:
if ((len(ncl[0])- len(we)))<20*len(y)/100:
ll.append(ncl[ncl.index(we)])
finl=[]
for h in ll:
for b in h:
finl.append(b)
finl.sort()
countr=0
if numpy.average(finl) <= numpy.std(finl) or numpy.std(finl)>= 0.50 * numpy.average(finl):
print("kmeans called again")
if countr<=5:
k_means(finl)
countr=countr+1
else:
print([numpy.average(finl),numpy.std(finl)] )
else:
cluster(y,Cd,ncl)
serviceTypeIndex= get_col_index('jobType')
totalAmountIndex= get_col_index('rate')
bikeModelIndex=get_col_index('bikeModel')
serviceTypeList= ['Paid Job','Warranty Repair','Free Service']
gml= get_model_list()
n=len(gml)
gml.remove('bikeModel')
for k in gml:
cost1=[]
cost2=[];cost3=[];cost4=[];cost5=[];cost6=[];cost7=[];cost8=[]
r=[cost1,cost2,cost3,cost4,cost5,cost6,cost7,cost8]
for i in range(sheet.nrows):
if sheet.cell_value(i,bikeModelIndex)==k:
if sheet.cell_value(i,serviceTypeIndex) in serviceTypeList:
ind= serviceTypeList.index(sheet.cell_value(i,serviceTypeIndex))
c= sheet.cell_value(i,totalAmountIndex)
r[ind].append(c)
print( "Information of "+k+" :")
print("")
for h in serviceTypeList:
index= serviceTypeList.index(h)
z= r[index]
if len(z)!=0 :
print("Is the effective cost and standard deviation of "+h+" "+ str(k_means(r[index])))
print("")
我正在尝试使用递归方法来实现k均值。我认为问题在于else条件簇(y,Cd,ncl)中的簇函数,直到达到深度为止,它会不断被调用。
回溯-
print("Is the effective cost and standard deviation of "+h+" "+ str(k_means(r[index])))
File "abcdef.py", line 92, in k_means
rty= cluster(y,Cd,cl)
File "abcdef.py", line 180, in cluster
cluster(y,Cd,ncl)
File "abcdef.py", line 180, in cluster
cluster(y,Cd,ncl)
File "abcdef.py", line 180, in cluster
cluster(y,Cd,ncl)
从回溯中我们可以看到,簇(y,Cd,ncl)再次被再次调用n。