使用 K-means 聚类并用热图展示结果

时间:2017-09-14 03:05:11

标签: python pandas numpy matplotlib heatmap

想请教一下:如何使用 k-means(k 均值)对这个数据集进行聚类?按要求,我不能使用任何现成的聚类包或模块。 https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv

上面的数据集是下面这个数据集的练习(训练)数据:

https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/cancer.csv

一直试图解决这个问题,尝试了几件事,但似乎没有一件事能奏效。不需要代码,但如果有人能给我一个通用的思考过程来解决这个问题,我将非常感激。

这是我目前的思路。我想把这些数据绘制成热图,目前的做法是:首先随机选择若干个中心;然后为每个点到各个中心的距离创建一个嵌套列表(列表的列表);找出每个点距离最小的那个中心的索引;创建一个与数据集形状相同的数据框,并在每个位置填入该点最近中心的索引;对中心索引相同的点取平均值,重新计算各个中心;重复上述过程,直到索引数据框不再变化为止;最后创建一个新的数据框,把属于同一中心的点放在一起,再据此绘制热图。

这似乎不起作用。我想知道,我的思路方向是否正确,还是完全走偏了?如果方向正确,需要修改哪些部分才能解决问题?如果不正确,请给我指出正确的方向。

以下是要查看的代码

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import random
#%matplotlib inline

def truncate(f, n):
    """Round ``f`` down to ``n`` decimal places.

    The value is scaled by ``10 ** n``, floored, and scaled back. Because
    ``math.floor`` is used, negative inputs move away from zero
    (``truncate(-1.239, 2)`` gives ``-1.24``).
    """
    scale = 10 ** n
    return math.floor(f * scale) / scale

def chooseCenter(data, centers):
    """Pick ``centers`` distinct starting centres from random cells of ``data``.

    Each candidate is the value of a uniformly chosen cell, rounded down to
    two decimals with ``truncate``. A candidate is kept only if that rounded
    value has not been chosen already, so the returned list never contains
    duplicates.

    Parameters:
        data: 2-D pandas DataFrame of numeric values.
        centers: number of centres to return.

    Returns:
        A list of ``centers`` distinct floats.

    Raises:
        ValueError: if ``data`` holds fewer than ``centers`` distinct
            (rounded, finite) values, which would otherwise loop forever.
    """
    rows, cols = data.shape

    # Rejection sampling below can only terminate if enough distinct rounded
    # values exist, so check that up front.
    available = {
        truncate(value, 2)
        for value in np.asarray(data).ravel()
        if math.isfinite(value)
    }
    if centers > len(available):
        raise ValueError(
            "cannot choose %d distinct centers from %d distinct values"
            % (centers, len(available))
        )

    cent = []
    while len(cent) < centers:
        x = random.randrange(0, rows)
        y = random.randrange(0, cols)
        # iloc[x, y] is purely positional; chained iloc[x][y] would make the
        # second lookup label-based.
        d = truncate(data.iloc[x, y], 2)
        # Compare the rounded value, since that is what is stored in cent.
        if d not in cent:
            cent.append(d)
    return cent


def distance(val, center):
    """Return the one-dimensional distance ``|val - center|`` as a float.

    This equals ``sqrt((val - center) ** 2)`` but takes the absolute value
    directly, so very large differences do not overflow when squared and
    very small ones do not underflow to zero.
    """
    return math.fabs(val - center)


def getDistances(centers, data):
    """Compute the distance from every cell of ``data`` to every centre.

    Parameters:
        centers: sequence of scalar centre values.
        data: 2-D table of numeric values (a pandas DataFrame in this file).

    Returns:
        A list with one entry per cell, in row-major order. Each entry is a
        list holding that cell's distance to each centre (same order as
        ``centers``), rounded down to three decimals with ``truncate``.
    """
    dist = []
    # Iterating the underlying array is positional, so it does not depend on
    # the frame's column labels and avoids building a Series for every cell.
    for row in np.asarray(data):
        for val in row:
            dist.append(
                [truncate(distance(val, center), 3) for center in centers]
            )
    return dist

def findClosest(data, dist):
    """Label every cell with the index of its nearest centre.

    Parameters:
        data: 2-D table; only its ``shape`` is read, to shape the result.
        dist: list with one entry per cell in row-major order, each entry
            being the list of that cell's distances to every centre (the
            structure returned by ``getDistances``).

    Returns:
        A pandas DataFrame with the same shape as ``data`` whose cells hold
        the position of the smallest distance in the matching ``dist``
        entry. On ties the first (lowest-index) centre wins.
    """
    indexes = [row.index(min(row)) for row in dist]
    return pd.DataFrame(np.array(indexes).reshape(data.shape))

def computeNewCenter(data, close):
    """Recompute each centre as the mean of the cells assigned to it.

    Parameters:
        data: 2-D table of numeric values.
        close: table of the same shape holding, for every cell, the index
            of the centre it is currently assigned to.

    Returns:
        A list of centre values, one per label that occurs in ``close``,
        ordered by ascending label. Each is rounded down to three decimals
        with ``truncate``. A label that no cell uses produces no centre, so
        the list can be shorter than the previous list of centres.

    Raises:
        ValueError: if ``data`` and ``close`` differ in shape.
    """
    values = np.asarray(data)
    labels = np.asarray(close)
    if values.shape != labels.shape:
        raise ValueError("data and close must have the same shape")

    # Work on flattened arrays so every cell is visited whatever the frame's
    # shape or column labels are.
    values = values.ravel()
    labels = labels.ravel()

    newCenters = []
    # np.unique yields the labels sorted, so centre k stays tied to label k
    # as long as no cluster has become empty.
    for label in np.unique(labels):
        members = values[labels == label]
        newCenters.append(truncate(np.mean(members), 3))
    return newCenters


def main():
    """Cluster the cells of ``simple.csv`` into three groups and plot them.

    Steps:
      1. Download the CSV and build a float DataFrame. The first CSV column
         is used as the row labels and the row order is reversed.
      2. Pick three starting centres, then alternate between assigning every
         cell to its nearest centre and recomputing the centres until the
         assignment stops changing.
      3. Lay the cell values out again grouped by cluster and draw that
         grid as a heatmap.
    """
    data = np.array(pd.read_csv('https://raw.githubusercontent.com/gsprint23/cpts215/master/progassignments/files/simple.csv',  header=None))
    data = data.T
    df = pd.DataFrame(data[1:], columns=data[0], dtype=float).T
    df = df.iloc[::-1]

    c = chooseCenter(df, 3)
    print(c)
    y = findClosest(df, getDistances(c, df))

    # Iterate until two consecutive assignments are identical. The cap is a
    # safety net in case rounding of the centres makes assignments oscillate.
    max_iterations = 100
    oldFrame = None
    for _ in range(max_iterations):
        if oldFrame is not None and y.equals(oldFrame):
            break
        oldFrame = y
        c = computeNewCenter(df, oldFrame)
        y = findClosest(df, getDistances(c, df))

    # Collect the cell values cluster by cluster (labels 1, 2, then 0), each
    # cluster in row-major order, and reshape them back into a grid so that
    # cells of the same cluster sit next to each other in the heatmap.
    values = np.asarray(df).ravel()
    labels = np.asarray(y).ravel()
    grouped = np.concatenate([values[labels == k] for k in (1, 2, 0)])
    l = pd.DataFrame(grouped.reshape(df.shape))
    print(l)

    hm = plt.pcolor(l, cmap=plt.cm.bwr)
    plt.colorbar(hm)
    # Without show() nothing is displayed when this runs as a plain script.
    plt.show()


# Run the clustering demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

感谢您阅读

0 个答案:

没有答案