Question

I found this code:

# kmeans clustering algorithm
# data = set of data points
# k = number of clusters
# c = initial list of centroids (if provided)
#
def kmeans(data, k, c):
    """Run k-means on ``data`` and print a summary of the resulting clusters.

    Args:
        data: Collection of data points. The helpers called here index it
            with a NumPy integer array, so it is presumably a 2-D
            ``numpy.ndarray`` with one point per row -- confirm against the
            caller.
        k: Number of clusters to form.
        c: Optional initial centroids (``None`` for none). At most the first
            ``k`` entries are used; any centroids still missing are drawn at
            random from ``data``.

    Returns:
        None. The iteration count, the centroids and the members of every
        cluster are written to standard output.
    """
    # Seed with the caller's centroids when given, stored as plain lists so
    # they compare cleanly against the recomputed means below.
    centroids = []
    if c is not None:
        centroids = [np.asarray(point).flatten().tolist() for point in c][:k]
    centroids = randomize_centroids(data, centroids, k - len(centroids))

    old_centroids = [[] for _ in range(k)]
    # Bound up front so the report below still works when the loop body
    # never runs (for example k == 0, where both centroid lists are empty).
    clusters = [[] for _ in range(k)]

    iterations = 0
    while not has_converged(centroids, old_centroids, iterations):
        iterations += 1

        # assign data points to clusters
        clusters = euclidean_dist(data, centroids, [[] for _ in range(k)])

        # recalculate centroids, remembering the previous ones so the next
        # convergence check can compare them
        for index, cluster in enumerate(clusters):
            old_centroids[index] = centroids[index]
            centroids[index] = np.mean(cluster, axis=0).tolist()

    print("The total number of data instances is: " + str(len(data)))
    print("The total number of iterations necessary is: " + str(iterations))
    print("The means of each cluster are: " + str(centroids))
    print("The clusters are as follows:")
    for cluster in clusters:
        print("Cluster with a size of " + str(len(cluster)) + " starts here:")
        print(np.array(cluster).tolist())
        print("Cluster ends here.")

    return

# Calculates euclidean distance between
# a data point and all the available cluster
# centroids.      
def euclidean_dist(data, centroids, clusters):
    """Assign every point in ``data`` to the cluster of its nearest centroid.

    Args:
        data: Iterable of data points; also indexed with a NumPy integer
            array when an empty cluster has to be filled.
        centroids: Indexable collection of centroid coordinates.
        clusters: Container of per-centroid member lists, mutated in place.

    Returns:
        The same ``clusters`` object, with each point appended to the entry
        of its closest centroid and every empty entry given one randomly
        chosen point from ``data``.
    """
    for point in data:
        # Position of the centroid with the smallest Euclidean distance to
        # this point; on a tie the lowest position wins.
        nearest = min(
            range(len(centroids)),
            key=lambda pos: np.linalg.norm(point - centroids[pos]),
        )
        try:
            clusters[nearest].append(point)
        except KeyError:
            clusters[nearest] = [point]

    # An empty cluster would produce a meaningless mean, so give each one a
    # single randomly selected data point.
    for members in clusters:
        if members:
            continue
        pick = np.random.randint(0, len(data), size=1)
        members.append(data[pick].flatten().tolist())

    return clusters


# randomize initial centroids
def randomize_centroids(data, centroids, k):
    """Append ``k`` randomly chosen points from ``data`` to ``centroids``.

    Points are drawn independently, so the same point may be picked more
    than once. Each pick is stored as a flat Python list.

    Returns:
        The same ``centroids`` list that was passed in, after extension.
    """
    remaining = k
    while remaining > 0:
        row = np.random.randint(0, len(data), size=1)
        chosen = data[row].flatten().tolist()
        centroids.append(chosen)
        remaining -= 1
    return centroids


# check if clusters have converged    
def has_converged(centroids, old_centroids, iterations):
    """Report whether the k-means loop should stop.

    The loop stops once more than ``MAX_ITERATIONS`` iterations have run, or
    as soon as the centroids equal those of the previous iteration.
    """
    MAX_ITERATIONS = 1000
    return iterations > MAX_ITERATIONS or old_centroids == centroids


I have added this section of code to read my data in CSV format:

import csv
# Read values1.csv and echo every parsed row (each row is a list of strings).
# The with-block closes the file handle once reading is done, and newline=""
# lets the csv module handle line endings itself.
with open("values1.csv", "r", newline="") as csv_file:
    reader = csv.reader(csv_file)
    for data in reader:
        print(data)
def kmeans(data, k, c):
    """Run k-means on ``data`` and print a summary of the resulting clusters.

    Args:
        data: Collection of data points. The helpers called here index it
            with a NumPy integer array, so it is presumably a 2-D
            ``numpy.ndarray`` with one point per row -- confirm against the
            caller.
        k: Number of clusters to form.
        c: Optional initial centroids (``None`` for none). At most the first
            ``k`` entries are used; any centroids still missing are drawn at
            random from ``data``.

    Returns:
        None. The iteration count, the centroids and the members of every
        cluster are written to standard output.
    """
    # Seed with the caller's centroids when given, stored as plain lists so
    # they compare cleanly against the recomputed means below.
    centroids = []
    if c is not None:
        centroids = [np.asarray(point).flatten().tolist() for point in c][:k]
    centroids = randomize_centroids(data, centroids, k - len(centroids))

    old_centroids = [[] for _ in range(k)]
    # Bound up front so the report below still works when the loop body
    # never runs (for example k == 0, where both centroid lists are empty).
    clusters = [[] for _ in range(k)]

    iterations = 0
    while not has_converged(centroids, old_centroids, iterations):
        iterations += 1

        # assign data points to clusters
        clusters = euclidean_dist(data, centroids, [[] for _ in range(k)])

        # recalculate centroids, remembering the previous ones so the next
        # convergence check can compare them
        for index, cluster in enumerate(clusters):
            old_centroids[index] = centroids[index]
            centroids[index] = np.mean(cluster, axis=0).tolist()

    print("The total number of data instances is: " + str(len(data)))
    print("The total number of iterations necessary is: " + str(iterations))
    print("The means of each cluster are: " + str(centroids))
    print("The clusters are as follows:")
    for cluster in clusters:
        print("Cluster with a size of " + str(len(cluster)) + " starts here:")
        print(np.array(cluster).tolist())
        print("Cluster ends here.")

    return

# Calculates euclidean distance between
# a data point and all the available cluster
# centroids.      
def euclidean_dist(data, centroids, clusters):
    """Assign every point in ``data`` to the cluster of its nearest centroid.

    Args:
        data: Iterable of data points; also indexed with a NumPy integer
            array when an empty cluster has to be filled.
        centroids: Indexable collection of centroid coordinates.
        clusters: Container of per-centroid member lists, mutated in place.

    Returns:
        The same ``clusters`` object, with each point appended to the entry
        of its closest centroid and every empty entry given one randomly
        chosen point from ``data``.
    """
    for instance in data:
        # Position of the centroid with the smallest Euclidean distance to
        # this point; on a tie the lowest position wins.
        mu_index = min(
            range(len(centroids)),
            key=lambda pos: np.linalg.norm(instance - centroids[pos]),
        )
        try:
            clusters[mu_index].append(instance)
        except KeyError:
            # Start a new member list when the container has no entry for
            # this position yet (only possible for mapping-style containers).
            clusters[mu_index] = [instance]

    # An empty cluster would produce a meaningless mean, so give each one a
    # single randomly selected data point.
    for cluster in clusters:
        if not cluster:
            pick = np.random.randint(0, len(data), size=1)
            cluster.append(data[pick].flatten().tolist())

    return clusters


# randomize initial centroids
def randomize_centroids(data, centroids, k):
    """Append ``k`` randomly chosen points from ``data`` to ``centroids``.

    Points are drawn independently, so the same point may be picked more
    than once. Each pick is stored as a flat Python list.

    Returns:
        The same ``centroids`` list that was passed in, after extension.
    """
    remaining = k
    while remaining > 0:
        row = np.random.randint(0, len(data), size=1)
        chosen = data[row].flatten().tolist()
        centroids.append(chosen)
        remaining -= 1
    return centroids


# check if clusters have converged    
def has_converged(centroids, old_centroids, iterations):
    """Report whether the k-means loop should stop.

    The loop stops once more than ``MAX_ITERATIONS`` iterations have run, or
    as soon as the centroids equal those of the previous iteration.
    """
    MAX_ITERATIONS = 1000
    return iterations > MAX_ITERATIONS or old_centroids == centroids

因为我刚接触 Python 这门编程语言……这个程序目前只是读取数据，我不知道如何在 Python 中调用函数。我只是想把这个实现应用到我的数据集上。

我的数据集如下：

Districts,MTP -public,MTP-private
Nicobar,2,0
North&Middle Andaman,0,0
South Andaman,124,0
Adilabad,191,27
Anantapur,99,185
Chitoor,264,0
cudappah,40,48
East Godavari,190,14
Guntur,145,0
Hyderabad,1331,578
Karim Nagar,54,175
Khammam,2,0
Krishna,162,113
Kurnool,50,0
Mahbubnagar,43,12
Medak,81,1086
Nalgonda,23,2
Nellore,62,0
Nizamabad,336,24
Prakasam,0,0
Ranga Reddy,0,0
Srikakulam,708,24
Vishakapatnam,151,0
Vizianagaram,128,192
Warangal,245,2
West Godavari,7,0
Anjaw,30,0
Changlang,53,0
Dibang valley,0,0
East Kameng,0,0
East Siang,123,8
Kurung Kamey,3,0
lohit,220,0
Lower  dinag valley,13,0
lower subansiri,218,0
papum pare,189,0
Tawang,56,0
Tirap,56,0
Upper Siang,48,0
Upper subansiri,48,0
West kameng,79,0
West Siang,48,0
Baksa,568,0
Barpeta,1474,0
Bongaigaon,1836,0
Cachar,1945,0
Chirang,116,2
Darrang,5094,0
Dhemaji,9124,0
Dhubri,2402,0
Dibrugarh,2431,0
Goalpara,3548,2
Golaghat,736,0
Hailakandi,431,0
Jorhat,4593,4
Kamrup_M,3534,366
Kamrup_R,4151,0
Karbi_Anglong,484,0
Karimganj,1029,375
Kokrajhar,1745,0
Lakhimpur,6279,0
Marigaon,523,0
Nagaon,4629,0
Nalbari,3368,1189
North_Cachar_Hills,371,0
Sibsagar,2047,0
Sonitpur,3371,126
Tinsukia,2385,69
Udalguri,461,56
BANKURA( 10-11),268,59
BARDHAMAN(10-11),1461,5486
BIRBHUM(10-11),77,5
DAKSHINI DINAJPUR(10-11),2931,0
DARJEELING(10-11),744,85
HAORA(10-11),598,3894
HOOGLY(10-11),1436,0
JALPAIGURI(10-11),2309,111
KOCH BIHAR(10-11),610,34
KOLKATA(10-11),2317,12632
MALDA(10-11),1949,71
MEDINIPUR EAST(10-11),951,297
MEDINIPUR WEST(10-11),1422,215
MURSHIDABAD(10-11),1711,215
NADAI(10-11),2773,4063
NORTH24PARGANAS(10-11),2388,2070
PURULIYA(10-11),789,1507
SOUTH24PARGANAS(10-11),1884,349
UTTAR DINAJPUR(10-11),1037,26
Anugul,650,9
Balangir,374,0
Baleshwar,728,0
Bargarh,651,202
Baudh,99,0
Bhadrak,1483,0
Cuttak,859,0
Deogarh,59,0
Dhenkana,886,187
Gajapati,104,0
Ganjam,698,0
Jagatsinghpur,336,0
Jajapur,757,3
Jharsuguda,77,0
Kalahandi,75,0
Kandhamal,468,0
Kendrapara,659,0
Keonjhar,1234,0
Khordha,665,0
Koraput,205,0
Malkangiri,311,0
Mayurbhanj,2332,0
Nabarangapur,4,0
Nayagarh,536,0
Nuapada,225,0
puri,600,0
Rayagada,145,21
Sambalpur,258,0
Sonapur,741,0
Sundargarh,832,0

Please guide me if possible.

python

0 个答案: