您好,我有一个 K-means 程序,想把它修改为增量式 K-means(incremental K-means)。
请问应该怎样实现?
提前谢谢!
from scipy.spatial import distance
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
#import tkinter as tk
from tkinter import *
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import adjusted_rand_score
import matplotlib.cm as cm
# Tell NumPy to stay silent on division-by-zero and invalid floating-point
# operations instead of emitting warnings.
np.seterr(divide='ignore', invalid='ignore')

# Select matplotlib's 'ggplot' style sheet for every figure drawn afterwards.
style.use('ggplot')

# Module-level scalars, both starting at zero.
x = y = 0

# Module-level result lists, created empty here (four distinct list objects);
# presumably filled in by code further down the file.
silhouette_avg, cluster_err, clusters_df, adj_rand_score = [], [], [], []
class K_Means:
    """Plain k-means clustering with a selectable distance metric.

    Attributes:
        k: Number of clusters.
        tolerance: Largest total centroid movement (in percent, see
            ``_percent_shift``) still regarded as "converged".
        max_iterations: Upper bound on the number of refinement passes.
        itr: Number of refinement passes executed so far. It is only reset
            by ``__init__``, so it accumulates across calls to ``fit``.
        centroids: ``{cluster index: centroid vector}``, set by ``fit``.
        classes: ``{cluster index: [member rows]}`` from the last pass,
            set by ``fit``.
        dist: Name of the metric passed to the last ``fit`` call.
    """

    def __init__(self, k = 3, tolerance = 0.0001, max_iterations = 500):
        self.k = k
        self.tolerance = tolerance
        self.max_iterations = max_iterations
        self.itr = 0

    @staticmethod
    def manhattan(a, b):
        """Return the Manhattan (L1) distance between sequences *a* and *b*."""
        return sum(abs(x - y) for x, y in zip(a, b))

    @staticmethod
    def _metric(dist):
        """Map a metric name to its distance function.

        Raises:
            ValueError: If *dist* is not one of the supported names.
        """
        metrics = {
            "euclidean": distance.euclidean,
            "cosine": distance.cosine,
            "manhattan": K_Means.manhattan,
            # Called without ``p``, so SciPy's default order applies.
            "minkowski": distance.minkowski,
        }
        try:
            return metrics[dist]
        except KeyError:
            raise ValueError(
                "unknown distance %r; expected one of %s"
                % (dist, ", ".join(sorted(metrics)))
            ) from None

    @staticmethod
    def _percent_shift(previous, current):
        """Return the summed absolute movement of a centroid, in percent.

        Absolute values are used so that moves towards smaller coordinates
        count as movement and opposite moves cannot cancel each other out.
        """
        previous = np.asarray(previous, dtype=float)
        current = np.asarray(current, dtype=float)
        scale = np.abs(previous)
        # A relative change is undefined where the old coordinate is zero;
        # fall back to the absolute change there instead of dividing by zero.
        scale = np.where(scale == 0, 1.0, scale)
        return float(np.sum(np.abs(current - previous) / scale * 100.0))

    def fit(self, data, dist):
        """Cluster *data* into ``self.k`` groups.

        Args:
            data: Array-like of numeric rows (one row per sample); it is
                converted to a float NumPy array.
            dist: Metric name: "euclidean", "cosine", "manhattan" or
                "minkowski".

        Raises:
            ValueError: If *dist* is not a supported metric name.
            IndexError: If *data* has fewer than ``self.k`` rows.
        """
        # Resolve the metric once instead of re-testing its name per point.
        measure = K_Means._metric(dist)
        data = np.asarray(data, dtype=float)
        self.dist = dist

        # Initialise the centroids with the first 'k' rows of the dataset.
        self.centroids = {index: data[index] for index in range(self.k)}

        # Start of the iterations.
        for _ in range(self.max_iterations):
            self.classes = {index: [] for index in range(self.k)}

            # Measure the distance from each point to every centroid and
            # assign the point to the nearest one.
            for features in data:
                distances = [
                    measure(features, self.centroids[centroid])
                    for centroid in self.centroids
                ]
                classification = distances.index(min(distances))
                self.classes[classification].append(features)

            previous = dict(self.centroids)

            # Recompute each centroid as the mean of its cluster's points.
            for classification, members in self.classes.items():
                # An empty cluster keeps its old centroid: averaging nothing
                # would produce NaN and poison every later distance.
                if members:
                    self.centroids[classification] = np.average(members, axis = 0)

            self.itr += 1

            # Leave the loop once the result is optimal, i.e. no centroid
            # moved by more than the tolerance.
            isOptimal = all(
                K_Means._percent_shift(previous[centroid], self.centroids[centroid])
                <= self.tolerance
                for centroid in self.centroids
            )
            if isOptimal:
                break