I am trying to implement the EM algorithm in Python 3 to find ML estimates of the parameters of a mixture of beta distributions that best fits some experimental data. Here I am evaluating the correctness of the code on simulated data.
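Concretely, the model is a two-component beta mixture, $p(x) = \sum_{j=1}^{2} \pi_j f(x;\alpha_j,\beta_j)$, where $f$ is the beta density. As I understand the iterated method of moments, each iteration computes responsibilities in the E-step and then, in the M-step, matches each component's weighted mean and variance and converts them back into shape parameters:

$$w_{ji} = \frac{\pi_j f(x_i;\alpha_j,\beta_j)}{\sum_k \pi_k f(x_i;\alpha_k,\beta_k)}, \qquad \pi_j = \frac{1}{n}\sum_i w_{ji},$$

$$\mu_j = \frac{\sum_i w_{ji}\,x_i}{n\pi_j}, \qquad \sigma_j^2 = \frac{\sum_i w_{ji}\,(x_i-\mu_j)^2}{n\pi_j},$$

$$\phi_j = \frac{\mu_j(1-\mu_j)}{\sigma_j^2} - 1, \qquad \alpha_j = \mu_j\phi_j, \qquad \beta_j = (1-\mu_j)\phi_j.$$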
Below is my code, adapted from link, which uses what [Schröder and Rahmann, 2017] call the iterated method of moments:
import numpy as np
from scipy.stats import beta
def em_gmm_orig(xs, pis, alphas, betas, tol=0.01, max_iter=100):
    """EM for a two-component beta mixture; the M-step matches moments."""
    n = len(xs)
    ll_old = 0
    for _ in range(max_iter):
        # E-step: responsibility of each component for each observation
        ws = np.zeros((2, n))
        for j in range(2):
            for i in range(n):
                ws[j, i] = pis[j] * beta(alphas[j], betas[j]).pdf(xs[i])
        ws /= ws.sum(0)
        # M-step: update the mixture weights
        pis = np.zeros(2)
        for j in range(2):
            for i in range(n):
                pis[j] += ws[j, i]
            pis[j] /= n
        # Weighted mean of each component
        mus = np.zeros(2)
        for j in range(2):
            for i in range(n):
                mus[j] += ws[j, i] * xs[i]
            mus[j] /= n * pis[j]
        # Weighted variance of each component
        sigmas = np.zeros(2)
        for j in range(2):
            for i in range(n):
                ys = xs[i] - mus[j]
                sigmas[j] += ws[j, i] * (ys ** 2)
            sigmas[j] /= n * pis[j]
        # Obtain alphas and betas from means and variances (method of moments)
        phis = [(mu * (1 - mu) / sigma - 1) for mu, sigma in zip(mus, sigmas)]
        alphas = [mu * phi for mu, phi in zip(mus, phis)]
        betas = [(1 - mu) * phi for mu, phi in zip(mus, phis)]
        # Update the observed-data log-likelihood
        ll_new = 0.0
        for i in range(n):
            s = 0
            for j in range(2):
                s += pis[j] * beta(alphas[j], betas[j]).pdf(xs[i])
            ll_new += np.log(s)
        if np.abs(ll_new - ll_old) < tol:
            break
        ll_old = ll_new
    return ll_new, pis, alphas, betas
# Ground truth
_alphas = np.array([1, 5])
_betas = np.array([7, 2])
_pis = np.array([0.2, 0.8])
# Initial random guesses for parameters
np.random.seed(0)
pis = np.random.random(2)
pis /= pis.sum()
alphas = np.random.random(2)
betas = np.random.random(2)
# Generate data
n = 1000
xs = np.concatenate([np.random.beta(a, b, int(p * n))
                     for p, a, b in zip(_pis, _alphas, _betas)])
# Call EM
ll, pis, alphas, betas = em_gmm_orig(xs, pis, alphas, betas)
print(ll)
print(pis)
print(alphas)
print(betas)
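To visualise how far off the fit is, I overlay the fitted and the true mixture densities on a histogram of the simulated data (a minimal check, assuming matplotlib is available; not part of the fitting code):

import matplotlib.pyplot as plt

grid = np.linspace(0.001, 0.999, 500)
# Fitted and ground-truth mixture densities evaluated on the grid
fit_pdf = sum(p * beta(a, b).pdf(grid) for p, a, b in zip(pis, alphas, betas))
true_pdf = sum(p * beta(a, b).pdf(grid) for p, a, b in zip(_pis, _alphas, _betas))
plt.hist(xs, bins=50, density=True, alpha=0.3, label='simulated data')
plt.plot(grid, fit_pdf, label='EM fit')
plt.plot(grid, true_pdf, label='ground truth')
plt.legend()
plt.show()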
Why are the parameter estimates I obtain so far from the values used to generate the data?
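For reference, here is the sanity check I use to quantify the mismatch, comparing the log-likelihood reported by EM with the log-likelihood of the same data under the ground-truth parameters (mixture_ll is a small helper I wrote just for this check):

def mixture_ll(data, weights, a_params, b_params):
    # Observed-data log-likelihood of a two-component beta mixture
    dens = sum(w * beta(a, b).pdf(data)
               for w, a, b in zip(weights, a_params, b_params))
    return np.log(dens).sum()

print(mixture_ll(xs, _pis, _alphas, _betas))  # under the true parameters
print(mixture_ll(xs, pis, alphas, betas))     # under the EM estimates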