Question

此帖子说明了如何在 emcee 中估计混合模型。但是，该帖子并没有估计潜在的聚类分配（latent cluster assignments），而我很难弄清楚该怎么做。

假设我想估计一个二项混合模型。如果不关心聚类分配，我们可以这样做（参照上面的帖子）：

import emcee
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import corner
from scipy import stats
# Global plotting configuration: seaborn "paper" context with the "darkgrid" style.
sns.set_context('paper')
sns.set_style('darkgrid')
# Switch off matplotlib's interactive mode.
plt.ioff()

# Simulate observations from a two-component binomial mixture.
p_true = [0.3, 0.5]  # success probability of each component
w_true = 0.5         # probability that an observation uses p_true[0]
N = 30               # number of trials per observation
size = 200           # number of observations

# Pick a component per observation first, then draw the binomial counts
# with the success probability of the chosen component.
in_first = np.random.rand(size) < w_true
B = np.random.binomial(N, np.where(in_first, p_true[0], p_true[1]))

# define the model:
# beta prior for the success probabilities
def lnprior_p(p):
    """Return the summed Beta(2, 2) log-density of the success probabilities.

    ``p`` may be a scalar or a sequence; the log-densities of all entries
    are added together.
    """
    log_densities = stats.beta(2, 2).logpdf(p)
    return np.sum(log_densities)

# beta prior for the unconditional cluster probability
def lnprior_w(w):
    """Return the Beta(1.2, 1.2) log-density of the mixing weight ``w``."""
    weight_prior = stats.beta(1.2, 1.2)
    return weight_prior.logpdf(w)

# likelihood of the data
def lnlike(theta,N,data):
    """Return the log-likelihood of ``data`` under a two-component binomial mixture.

    ``theta`` unpacks to ``(p1, p2, w)``: the two success probabilities and
    the weight of the first component. ``N`` is the number of trials.
    The component label of each observation is summed out.
    """
    p1, p2, w = theta

    # Per-observation log(weight * pmf) for each of the two components.
    component_logs = [
        np.log(weight) + stats.binom.logpmf(data, N, prob)
        for weight, prob in ((w, p1), (1 - w, p2))
    ]

    # logaddexp adds the two component terms in log space.
    return np.sum(np.logaddexp(*component_logs))

# posterior probability up to a constant
def fullmodel(theta,N,data):
    """Return the unnormalised log-posterior of ``theta = (p1, p2, w)``.

    Returns ``-inf`` when ``p1 >= p2`` or when any entry of ``theta`` lies
    outside the open interval (0, 1); otherwise the sum of the
    log-likelihood and the log-priors.
    """
    p1, p2, w = theta
    lower, upper = 0, 1

    # Ordering constraint on the two success probabilities
    # (presumably to keep the two components distinguishable).
    if p1 >= p2:
        return -np.inf

    # All three parameters must lie strictly inside (0, 1).
    if any(not (lower < value < upper) for value in theta):
        return -np.inf

    return lnlike(theta, N, data) + lnprior_p((p1, p2)) + lnprior_w(w)

# Initialize the model: 3 parameters (p1, p2, w) explored by 8 walkers.
ndim = 3
nwalkers = 8

# Each walker starts with p1 in [0, 0.5), p2 in [0.5, 1) and w in [0, 1).
start = []
for _ in range(nwalkers):
    p1_init = np.random.uniform(0, .5)
    p2_init = np.random.uniform(.5, 1)
    w_init = np.random.rand(1)
    start.append(np.r_[p1_init, p2_init, w_init])

# Set up the sampler.
# fullmodel is the log-probability function; (N, B) are passed to it as extra arguments.
sampler = emcee.EnsembleSampler(nwalkers, ndim, fullmodel,args=(N,B))

# Run a burn-in chain and save the last location.
# NOTE(review): the three-way unpacking assumes run_mcmc's return value
# unpacks into exactly three items — confirm against the installed emcee version.
pos, _, _ = sampler.run_mcmc(start, 5000)

# Run the chain.
# reset() is called first, presumably so that the burn-in samples are not
# kept alongside the production run — verify against the emcee docs.
sampler.reset()
sampler.run_mcmc(pos, 5000);

# plot results
# Corner plot of the flattened chain; the values used to generate the data
# are passed as `truths`.
labels = ["$p1$", "$p2$", "$w$"]
corner.corner(sampler.flatchain, bins=35, labels=labels,truths=[*p_true,w_true]);

如何更改以上代码以获得聚类分配的后验样本？我以为下面这样做应该可行：

# adding a prior for the cluster assignment
def lnprior_z(z,w):
    """Return the summed Bernoulli log-prior of the cluster labels ``z``.

    ``z == 1`` marks the second cluster, so the success probability of the
    single-trial binomial is ``1 - w``.
    """
    prob_second = 1 - w
    return np.sum(stats.binom.logpmf(z, 1, prob_second))

# change likelihood to condition on the cluster assignment
def lnlike(theta,z,N,data):
    """Return the log-likelihood of ``data`` given the cluster labels ``z``.

    Observations with ``z == 0`` use success probability ``p1``; every other
    label value uses ``p2``. The third entry of ``theta`` is not used here.
    """
    p1, p2, _ = theta

    # Success probability per observation, selected by its label.
    prob_per_obs = np.where(z == 0, p1, p2)
    return np.sum(stats.binom.logpmf(data, N, prob_per_obs))

# adapt the posterior prob and introduce the cluster assignments as additional parameters to be estimated
def fullmodel(theta,N,data):
    """Return the unnormalised log-posterior of ``(p1, p2, w, z...)``.

    The first three entries of ``theta`` are ``p1``, ``p2`` and ``w``; the
    remaining entries are the cluster labels ``z``. Returns ``-inf`` when
    ``p1 >= p2`` or when any of the first three entries lies outside the
    open interval (0, 1).

    NOTE(review): emcee's moves appear to propose real-valued parameters,
    so the ``z`` entries will generally not stay at 0/1 — confirm whether
    discrete labels can be sampled this way at all.
    """
    continuous = theta[:3]
    p1, p2, w = theta[0], theta[1], theta[2]
    z = theta[3:]
    lower, upper = 0, 1

    if p1 >= p2:
        return -np.inf
    if any(not (lower < value < upper) for value in continuous):
        return -np.inf

    return (
        lnlike(continuous, z, N, data)
        + lnprior_z(z, w)
        + lnprior_p((p1, p2))
        + lnprior_w(w)
    )

# and give starting values for the cluster assignments
w0 = np.random.rand(nwalkers)

# One start vector per walker: p1, p2, w, then `size` labels drawn with
# P(z = 1) = 1 - w.
start = []
for w_init in w0:
    p1_init = np.random.uniform(0, .5)
    p2_init = np.random.uniform(.5, 1)
    z_init = np.random.binomial(n=1, p=1 - w_init, size=size)
    start.append(np.r_[p1_init, p2_init, w_init, z_init])

但是，如果我这样做，就需要估计 3+size 个参数，而 emcee 不允许我这样做，除非我把 walker 的数量增加到参数数量的两倍，但我不明白为什么。代码运行了一段时间，输出却毫无意义（如果我想绘制结果，它会提示：WARNING:root:Too few points to create valid contours）。我究竟做错了什么？（此帖子描述了如何在 pymc3 中使用聚类分配来估计正态混合模型，但是我想使用 emcee，部分是出于比较的目的，部分是因为我想扩展模型并使用{{ 3}}）。

使用 emcee 的混合模型估计

0 个答案: