算法Kmeans聚类

时间:2018-06-05 21:32:39

标签: python-3.6

我有一个 K-means 程序,想把它修改成增量式 K-means(incremental k-means)。

请问我该怎么做?

提前谢谢

from scipy.spatial import distance
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
#import tkinter as tk
from tkinter import *
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import adjusted_rand_score
import matplotlib.cm as cm
# Tell NumPy to ignore divide-by-zero and invalid floating-point operations.
np.seterr(divide='ignore', invalid='ignore')

# Select matplotlib's 'ggplot' style sheet.
style.use('ggplot')

# Module-level state. None of these names is read in the visible part of the
# file; presumably they are filled in further down -- TODO confirm.
x = y = 0
silhouette_avg, cluster_err, clusters_df, adj_rand_score = [], [], [], []
class K_Means:
    """K-means clustering with a selectable point-to-centroid distance.

    Attributes set by ``__init__``:
        k: number of clusters.
        tolerance: convergence threshold, compared against the summed
            absolute percentage change of each centroid's components.
        max_iterations: upper bound on refinement passes per ``fit`` call.
        itr: running count of refinement passes (never reset by ``fit``).

    Attributes set by ``fit``:
        centroids: dict mapping cluster index -> centroid vector.
        classes: dict mapping cluster index -> list of the points assigned
            to that cluster during the last pass.
        dist: name of the distance metric that was requested.
    """

    def __init__(self, k = 3, tolerance = 0.0001, max_iterations = 500):
        self.k = k
        self.tolerance = tolerance
        self.max_iterations = max_iterations
        self.itr = 0

    @staticmethod
    def manhattan(a, b):
        """Return the Manhattan (L1) distance between sequences *a* and *b*.

        Declared static so that both ``K_Means.manhattan(a, b)`` and
        ``instance.manhattan(a, b)`` work.
        """
        return sum(abs(x - y) for x, y in zip(a, b))

    def fit(self, data, dist):
        """Cluster *data* into ``self.k`` groups.

        Args:
            data: indexable, iterable collection of feature vectors. The first
                ``k`` entries seed the centroids, so ``data[i]`` must be valid
                for every ``i < k``.
            dist: one of ``"euclidean"``, ``"cosine"``, ``"manhattan"`` or
                ``"minkowski"``.

        Raises:
            ValueError: if *dist* is not one of the supported metric names.
        """
        metrics = {
            "euclidean": distance.euclidean,
            "cosine": distance.cosine,
            "manhattan": self.manhattan,
            "minkowski": distance.minkowski,
        }
        try:
            metric = metrics[dist]
        except KeyError:
            # Fail early and clearly instead of hitting an unbound local later.
            raise ValueError(
                f"unknown distance {dist!r}; expected one of {sorted(metrics)}"
            ) from None

        self.centroids = {}
        self.dist = dist

        # Seed the centroids with the first 'k' elements of the dataset.
        for i in range(self.k):
            self.centroids[i] = data[i]

        # Refinement passes.
        for _ in range(self.max_iterations):
            self.classes = {i: [] for i in range(self.k)}

            # Assign every point to its nearest centroid.
            for features in data:
                distances = [
                    metric(features, self.centroids[label])
                    for label in self.centroids
                ]
                self.classes[distances.index(min(distances))].append(features)

            previous = dict(self.centroids)

            # Recompute each centroid as the mean of its members. A cluster
            # that received no points keeps its previous centroid; averaging
            # an empty list would yield NaN and break later passes.
            for label, members in self.classes.items():
                if members:
                    self.centroids[label] = np.average(members, axis = 0)

            self.itr += 1

            # Stop once no centroid moved by more than the tolerance.
            if self._has_converged(previous):
                break

    def _has_converged(self, previous):
        """Return True if no centroid moved more than ``self.tolerance``.

        Movement is the sum of absolute percentage changes per component.
        Absolute values keep opposite-signed moves from cancelling out.
        """
        with np.errstate(divide='ignore', invalid='ignore'):
            for label, current in self.centroids.items():
                before = np.asarray(previous[label], dtype=float)
                if before.size == 0:
                    continue
                after = np.asarray(current, dtype=float)
                change = np.abs((after - before) / before) * 100.0
                # 0/0 (an unchanged zero component) is NaN and counts as no
                # movement; x/0 is inf and correctly counts as movement.
                if np.nansum(change) > self.tolerance:
                    return False
        return True

0 个答案:

没有答案