How to implement the EM (expectation-maximization) algorithm for GMM clustering of N-dimensional feature vectors in Python

Date: 2018-10-19 07:23:13

Tags: python machine-learning scikit-learn k-means gmm

I am trying to implement GMM clustering for 24-dimensional and 32-dimensional feature vectors, where the initial parameters are assigned by the k-means algorithm (k-means provides only the cluster centers, i.e. only the mu). I am following this link, where the implementation covers only 2-D feature vectors with predefined mu and sigma; the remaining initial parameters (weights and covariances) can also be derived from the k-means labels, as in the sketch below.
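A minimal sketch of that full initialization using sklearn's KMeans (the helper name `init_from_kmeans` is illustrative, not from the original post): the means come from the cluster centers, the mixing weights from the cluster sizes, and each covariance from the sample covariance of the points assigned to that cluster.

import numpy as np
from sklearn.cluster import KMeans

def init_from_kmeans(X, k):
    """Derive initial GMM parameters (means, weights, covariances) from k-means."""
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    mu = km.cluster_centers_                                  # (k, d) means
    w = np.bincount(km.labels_, minlength=k) / len(X)         # mixing weights
    covs = [np.cov(X[km.labels_ == i].T) for i in range(k)]   # (d, d) each
    return mu, w, covs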

If anyone has code for GMM clustering, please post it.

sklearn also has a ready-made class for GMMs, but it does not give me the likelihood at each iteration. sklearn GMM
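On the sklearn point: in recent scikit-learn versions (0.18+, where GaussianMixture replaced the deprecated GMM class), you can recover a per-iteration log-likelihood by running one EM step per fit() call with warm_start. A minimal sketch, not from the original post (X here is placeholder data):

import numpy as np
from sklearn.mixture import GaussianMixture

X = np.random.rand(500, 24)  # placeholder: any (n, d) data matrix

# max_iter=1 plus warm_start=True makes each fit() call run one more EM
# step from the previous parameters (expect ConvergenceWarnings per call).
gm = GaussianMixture(n_components=3, init_params='kmeans',
                     max_iter=1, warm_start=True)

log_likelihoods = []
for _ in range(150):
    gm.fit(X)
    log_likelihoods.append(gm.lower_bound_)  # lower bound on the log-likelihood
    if gm.converged_:
        break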

1 Answer:

Answer 0: (score: 0)

import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple
from sklearn.metrics import pairwise_distances_argmin


def kmeans(dataSet, k, c):
    # 1. Randomly choose k data points as the initial cluster centers
    #    (c seeds the random number generator)
    rng = np.random.RandomState(c)
    p = rng.permutation(dataSet.shape[0])[:k]
    centers = dataSet[p]

    # 2. Alternate assignment and update steps until the centers stop moving
    while True:
        labels = pairwise_distances_argmin(dataSet, centers)
        new_centers = np.array([dataSet[labels == i].mean(0) for i in range(k)])
        if np.all(centers == new_centers):
            break
        centers = new_centers

    # 3. Per-cluster mixing weights (fraction of points) and sample covariances
    cluster_data = [dataSet[labels == i] for i in range(k)]
    l = []
    covs = []
    for i in range(k):
        l.append(len(cluster_data[i]) * 1.0 / len(dataSet))
        covs.append(np.cov(np.array(cluster_data[i]).T))
    return centers, l, covs, cluster_data




class gaussian_Mix_Model:

    def __init__(self, k=8, eps=1e-7):
        self.k = k      # number of clusters
        self.eps = eps  # convergence threshold `epsilon`

    def fit_EM(self, X, max_iters=1000):

        # n = number of data points, d = dimension of the data points
        n, d = X.shape

        # Initialize the means, weights and covariances from k-means
        # (c=0 seeds the k-means initialization)
        mu, w, Cov, _ = kmeans(X, self.k, c=0)

        # R[i, k] will hold the responsibility of cluster k for point i
        R = np.zeros((n, self.k))

        # Log-likelihood after each iteration
        LLhoods = []

        # Multivariate Gaussian density, vectorized over the rows of X
        P = lambda mu, s: np.linalg.det(s) ** -0.5 * (2 * np.pi) ** (-d / 2.) \
            * np.exp(-0.5 * np.einsum('ij, ij -> i',
                     X - mu, np.dot(np.linalg.inv(s), (X - mu).T).T))

        # Iterate for at most max_iters iterations
        while len(LLhoods) < max_iters:

            # Expectation step: weighted density of each point under each cluster
            for k in range(self.k):
                R[:, k] = w[k] * P(mu[k], Cov[k])

            # Log-likelihood of the data under the current parameters,
            # computed before R is normalized
            LLhood = np.sum(np.log(np.sum(R, axis=1)))
            LLhoods.append(LLhood)

            # Normalize to responsibilities; N_ks is the effective
            # number of data points assigned to each cluster
            R = (R.T / np.sum(R, axis=1)).T
            N_ks = np.sum(R, axis=0)

            # Maximization step: re-estimate the parameters of each cluster
            for k in range(self.k):

                # New mean: responsibility-weighted average of the data
                mu[k] = 1. / N_ks[k] * np.sum(R[:, k] * X.T, axis=1).T
                x_mu = X - mu[k]

                # New covariance: responsibility-weighted outer products
                Cov[k] = np.dot(R[:, k] * x_mu.T, x_mu) / N_ks[k]

                # New mixing weight PiK
                w[k] = N_ks[k] / n

            # Check for convergence of the log-likelihood
            if len(LLhoods) > 1 and np.abs(LLhoods[-1] - LLhoods[-2]) < self.eps:
                break

        Params = namedtuple('Params', ['mu', 'Cov', 'w', 'LLhoods', 'num_iters'])
        self.params = Params(mu=mu, Cov=Cov, w=w,
                             LLhoods=LLhoods, num_iters=len(LLhoods))

        return self.params

# Call the GMM to fit the model.
# X is assumed to be the (n, d) data matrix of feature vectors.
gmm = gaussian_Mix_Model(3, 0.000001)
params = gmm.fit_EM(X, max_iters=150)

# Plot of log-likelihood vs. iterations (one curve per fitted model;
# fit one GMM per class to reproduce the per-class plots).
plt.plot(params.LLhoods)
plt.xlabel('Iteration')
plt.ylabel('Log-likelihood')
plt.savefig('Dataset_2A_GMM_Class_1_K_16.png')
plt.clf()
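For reference, the E- and M-steps above implement the standard EM updates for a K-component Gaussian mixture; `LLhood` in the loop is the log-likelihood $\sum_i \log \sum_k w_k\,\mathcal{N}(x_i \mid \mu_k, \Sigma_k)$, evaluated before R is normalized:

$$r_{ik} = \frac{w_k\,\mathcal{N}(x_i \mid \mu_k, \Sigma_k)}{\sum_{j=1}^{K} w_j\,\mathcal{N}(x_i \mid \mu_j, \Sigma_j)}, \qquad N_k = \sum_{i=1}^{n} r_{ik}$$

$$\mu_k = \frac{1}{N_k}\sum_{i=1}^{n} r_{ik}\,x_i, \qquad \Sigma_k = \frac{1}{N_k}\sum_{i=1}^{n} r_{ik}\,(x_i - \mu_k)(x_i - \mu_k)^{\top}, \qquad w_k = \frac{N_k}{n}$$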