这篇帖子(post)说明了如何在 emcee 中估计混合模型。但是,在这篇文章中,我并没有估计潜在的聚类分配,而且我很难弄清楚该怎么做。
假设我想估计一个二项混合模型。如果不关心聚类分配,我们可以这样做(参照上面的帖子):
import emcee
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import corner
from scipy import stats
# Use seaborn's 'paper' plotting context and 'darkgrid' style for the figures.
sns.set_context('paper')
sns.set_style('darkgrid')
# Switch matplotlib's interactive mode off.
plt.ioff()
# Simulate observations from a two-component binomial mixture.
p_true = [0.3, 0.5]  # success probability of each component
w_true = 0.5         # probability of belonging to the first component
N = 30               # number of trials per observation
size = 200           # number of observations
# Draw the component membership first, then the binomial counts.
in_first_component = np.random.rand(size) < w_true
success_prob = np.where(in_first_component, p_true[0], p_true[1])
B = np.random.binomial(N, success_prob)
# define the model:
# beta prior for the success probabilities
def lnprior_p(p):
    """Return the joint log-density of independent Beta(2, 2) priors.

    Parameters
    ----------
    p : array_like
        Success probabilities, one entry per mixture component.

    Returns
    -------
    float
        Sum of the Beta(2, 2) log-densities evaluated at every entry of
        ``p``; ``-inf`` as soon as one entry lies outside the support.
    """
    return np.sum(stats.beta.logpdf(p, a=2, b=2))
# beta prior for the unconditional cluster probability
def lnprior_w(w):
    """Return the Beta(1.2, 1.2) log-density of the mixture weight.

    Parameters
    ----------
    w : float or array_like
        Unconditional probability of the first cluster.

    Returns
    -------
    float or ndarray
        Log-density of ``w`` under a Beta(1.2, 1.2) prior (element-wise for
        array input).
    """
    return stats.beta.logpdf(w, a=1.2, b=1.2)
# likelihood of the data
def lnlike(theta, N, data):
    """Return the log-likelihood of a two-component binomial mixture.

    The cluster assignments are marginalised out: every observation
    contributes ``log(w * Binom(x | N, p1) + (1 - w) * Binom(x | N, p2))``.

    Parameters
    ----------
    theta : sequence of 3 floats
        ``(p1, p2, w)``: the two success probabilities and the weight of
        the first component.
    N : int
        Number of trials of each binomial observation.
    data : array_like
        Observed success counts.

    Returns
    -------
    float
        Log-likelihood summed over all observations.
    """
    p1, p2, w = theta
    log_comp1 = np.log(w) + stats.binom.logpmf(data, N, p1)
    log_comp2 = np.log(1 - w) + stats.binom.logpmf(data, N, p2)
    # logaddexp forms log(exp(a) + exp(b)) without leaving log space.
    return np.sum(np.logaddexp(log_comp1, log_comp2))
# posterior probability up to a constant
def fullmodel(theta, N, data):
    """Return the log-posterior (up to an additive constant) of the mixture.

    Parameters
    ----------
    theta : sequence of 3 floats
        ``(p1, p2, w)``.
    N : int
        Number of trials of each binomial observation.
    data : array_like
        Observed success counts.

    Returns
    -------
    float
        ``-inf`` when ``p1 >= p2`` or when any parameter lies outside the
        open interval (0, 1); otherwise the log-likelihood plus the
        log-priors of ``(p1, p2)`` and ``w``.
    """
    p1, p2, w = theta
    lower, upper = 0, 1
    # Requiring p1 < p2 fixes the order of the two components.
    if p1 >= p2 or not all(lower < value < upper for value in theta):
        return -np.inf
    return lnlike(theta, N, data) + lnprior_p((p1, p2)) + lnprior_w(w)
# Initialize the model: three parameters (p1, p2, w) explored by eight walkers.
ndim, nwalkers = 3, 8
# Every walker starts from p1 ~ U(0, .5), p2 ~ U(.5, 1) and w ~ U(0, 1).
start = [
    np.r_[np.random.uniform(0, .5), np.random.uniform(.5, 1), np.random.rand(1)]
    for _ in range(nwalkers)
]

# Set up the sampler.
sampler = emcee.EnsembleSampler(nwalkers, ndim, fullmodel, args=(N, B))

# Burn-in: run a first chain and keep only the walkers' final positions.
pos, _, _ = sampler.run_mcmc(start, 5000)

# Production run: discard the burn-in samples, then continue from `pos`.
sampler.reset()
sampler.run_mcmc(pos, 5000);

# Corner plot of the flattened chain, with the true values marked.
labels = ["$p1$", "$p2$", "$w$"]
corner.corner(
    sampler.flatchain,
    bins=35,
    labels=labels,
    truths=[*p_true, w_true],
);
如何更改以上代码以获得聚类分配的后验样本?我以为下面这样做应该可行:
# adding a prior for the cluster assignment
def lnprior_z(z, w):
    """Return the log-prior of the cluster assignments given the weight.

    Each assignment is Bernoulli with success probability ``1 - w``:
    ``z == 1`` marks the second cluster, ``z == 0`` the first.

    Parameters
    ----------
    z : array_like
        Cluster indicators, one per observation.
    w : float
        Unconditional probability of the first cluster.

    Returns
    -------
    float
        Sum of the Bernoulli log-probabilities of all entries of ``z``.

    Notes
    -----
    NOTE(review): the binomial log-pmf is ``-inf`` for any non-integer
    entry of ``z``. A sampler that proposes real-valued coordinates will
    therefore see a ``-inf`` prior for almost every proposal -- confirm
    that the sampler in use can handle discrete parameters.
    """
    # z=1 is the second cluster, hence p=1-w.
    return np.sum(stats.binom.logpmf(z, n=1, p=1 - w))
# change likelihood to condition on the cluster assignment
def lnlike(theta, z, N, data):
    """Return the binomial log-likelihood conditional on the assignments.

    Parameters
    ----------
    theta : sequence of 3 floats
        ``(p1, p2, w)``; ``w`` is unpacked but not used here because the
        likelihood conditions on ``z``.
    z : array_like
        Cluster indicators, one per observation. Entries equal to 0 select
        ``p1``; every other value selects ``p2``.
    N : int
        Number of trials of each binomial observation.
    data : array_like
        Observed success counts.

    Returns
    -------
    float
        Log-likelihood summed over all observations.
    """
    p1, p2, w = theta
    # Coerce to an array so that `z == 0` is evaluated element-wise even when
    # a plain list is passed (a list would compare as a single False).
    z = np.asarray(z)
    success_prob = np.where(z == 0, p1, p2)
    return np.sum(stats.binom.logpmf(data, N, success_prob))
# adapt the posterior prob and introduce the cluster assignments as additional parameters to be estimated
def fullmodel(theta, N, data):
    """Return the log-posterior with the cluster assignments as parameters.

    Parameters
    ----------
    theta : sequence
        Parameter vector laid out as ``[p1, p2, w, z_1, z_2, ...]``: the
        first three entries are the mixture parameters, all remaining
        entries are the cluster indicators.
    N : int
        Number of trials of each binomial observation.
    data : array_like
        Observed success counts.

    Returns
    -------
    float
        ``-inf`` when ``p1 >= p2`` or when ``p1``, ``p2`` or ``w`` lies
        outside the open interval (0, 1); otherwise the conditional
        log-likelihood plus the log-priors of ``z``, ``(p1, p2)`` and ``w``.
    """
    p1, p2, w, z = theta[0], theta[1], theta[2], theta[3:]
    lower, upper = 0, 1
    # Only the three continuous parameters are range-checked; requiring
    # p1 < p2 fixes the order of the two components.
    if p1 >= p2 or not all(lower < value < upper for value in theta[:3]):
        return -np.inf
    return (
        lnlike(theta[:3], z, N, data)
        + lnprior_z(z, w)
        + lnprior_p((p1, p2))
        + lnprior_w(w)
    )
# Starting values for every walker: p1, p2, w, followed by one cluster
# indicator per observation drawn with success probability 1 - w.
w0 = np.random.rand(nwalkers)
start = [
    np.r_[
        np.random.uniform(0, .5),
        np.random.uniform(.5, 1),
        w_k,
        np.random.binomial(n=1, p=1 - w_k, size=size),
    ]
    for w_k in w0
]
但是,如果我这样做,就需要估计 3+size 个参数,而 emcee 不允许这样做,除非我把 walker(游走者)的数量增加到参数数量的两倍,但我不明白为什么。这段代码运行了一段时间,输出却毫无意义(当我想绘制结果时,它提示:WARNING:root:Too few points to create valid contours)。
我究竟做错了什么?(这篇帖子描述了如何在 pymc3 中估计带聚类分配的正态混合模型,但我想使用 emcee,部分是为了做比较,部分是因为我想扩展模型并使用{{ 3}}。)