如何用PyMC3实现潜在狄利克雷分配（LDA）？

时间:2015-07-17 10:13:03

标签: lda pymc3

我正在尝试使用PyMC3实现lda。

但是,在定义模型的最后部分时,根据主题对单词进行采样,我不断收到错误:TypeError:list indices必须是整数,而不是TensorVariable

如何解决这个问题?

代码如下:

## Data Preparation

K = 2 # number of topics
N = 4 # number of words
D = 3 # number of documents

import numpy as np

data = np.array([[1, 1, 1, 1], [1, 1, 1, 1], [0, 0, 0, 0]])
Wd = [len(doc) for doc in data]  # length of each document

## Model Specification

from pymc3 import Model, Normal, HalfNormal, Dirichlet, Categorical, constant

lda_model = Model()

with lda_model:

    # Priors for unknown model parameters
    alpha = HalfNormal('alpha', sd=1)
    eta = HalfNormal('eta', sd=1)

    a1 = eta*np.ones(shape=N)
    a2 = alpha*np.ones(shape=K)

    beta = [Dirichlet('beta_%i' % i, a1, shape=N) for i in range(K)]
    theta = [Dirichlet('theta_%s' % i, a2, shape=K) for i in range(D)]

    z = [Categorical('z_%i' % d, p = theta[d], shape=Wd[d]) for d in range(D)]

    # That's when you get the error. It is caused by: beta[z[d][w]]
    w = [Categorical('w_%i_%i' % (d, w), p = beta[z[d][w]], observed = data[i,j]) for d in range(D) for w in range(Wd[d])]

非常感谢任何帮助!

3 个答案:

答案 0 :(得分:1)

beta[z[d][w]]自然不正确，因为z[d][w]是PyMC的随机变量（stochastic variable），而不是固定的整数索引。

在pymc2中，它可以用如下的lambda函数解决：
p=pm.Lambda("phi_z_%s_%s" % (d,i), 
             lambda z=z[d][w], beta=beta: beta[z])

在pymc3中，它可以通过如下方式解决：
@theano.compile.ops.as_op
def your_function

但是这里有一个问题：Theano似乎不允许传入由pymc变量组成的python列表，t.lvector基本上不起作用。

此问题中有更多讨论: Unable to create lambda function in hierarchical pymc3 model

答案 1 :(得分:0)

请参阅this blog post。我还没有测试过。

 import numpy as np  
 import pymc as pc  


 def wordDict(collection):  
  word_id  = {}  
  idCounter = 0  
  for d in collection:  
    for w in d:  
      if (w not in word_id):  
        word_id[w] = idCounter  
        idCounter+=1  
  return word_id  

 def toNpArray(word_id, collection):  
  ds = []  
  for d in collection:  
    ws = []  
    for w in d:  
      ws.append(word_id.get(w,0))  
    ds.append(ws)  
  return np.array(ds)  

 ###################################################  

 #doc1, doc2, ..., doc7  
 docs = [["sepak","bola","sepak","bola","bola","bola","sepak"],  
         ["uang","ekonomi","uang","uang","uang","ekonomi","ekonomi"],  
         ["sepak","bola","sepak","bola","sepak","sepak"],  
         ["ekonomi","ekonomi","uang","uang"],  
         ["sepak","uang","ekonomi"],  
         ["komputer","komputer","teknologi","teknologi","komputer","teknologi"],  
         ["teknologi","komputer","teknologi"]]  

 word_dict = wordDict(docs)  
 collection = toNpArray(word_dict,docs)  

 #number of topics  
 K = 3  

 #number of words (vocab)  
 V = len(word_dict)  

 #number of documents  
 D = len(collection)  

 #array([1, 1, 1, ..., 1]) K times  
 alpha = np.ones(K)  

 #array([1, 1, 1, ..., 1]) V times  
 beta = np.ones(V)  

 #array containing the information about doc length in our collection
 Nd = [len(doc) for doc in collection]  


 ######################## LDA model ##################################  

 #topic distribution per-document  
 theta = pc.Container([pc.CompletedDirichlet("theta_%s" % i,   
                                             pc.Dirichlet("ptheta_%s"%i, theta=alpha))  
                      for i in range(D)])  

 #word distribution per-topic  
 phi = pc.Container([pc.CompletedDirichlet("phi_%s" % j,   
                                           pc.Dirichlet("pphi_%s" % j, theta=beta))  
                     for j in range(K)])  


 #Please note that this is the tricky part :)  
 z = pc.Container([pc.Categorical("z_%i" % d,  
                                  p = theta[d],  
                                  size = Nd[d],  
                                  value = np.random.randint(K, size=Nd[d]))   
                   for d in range(D)])  

 #word generated from phi, given a topic z  
 w = pc.Container([pc.Categorical("w_%i_%i" % (d,i),  
                                  p = pc.Lambda("phi_z_%i_%i" % (d,i),  
                                                lambda z=z[d][i], phi=phi : phi[z]),
                                  value=collection[d][i],  
                                  observed=True)  
                   for d in range(D) for i in range(Nd[d])])  

 ####################################################################  

 model = pc.Model([theta, phi, z, w])  
 mcmc = pc.MCMC(model)  
 mcmc.sample(iter=5000, burn=1000)  


 #show the topic assignment for each word, using the last trace  
 for d in range(D):  
    print(mcmc.trace('z_%i'%d)[3999])  

答案 2 :(得分:0)

以下代码改编自@Hanan引用的内容。我以某种方式使它与pymc3一起工作。

import numpy as np
import pymc3 as pm

def get_word_dict(collection):
    vocab_list = list({word for doc in collection for word in doc})
    idx_list = [i for i in range(len(vocab_list))]
    return dict(zip(vocab_list,idx_list))

def word_to_idx(dict_vocab_idx, collection):
    return [[dict_vocab_idx[word] for word in doc] for doc in collection]

docs = [["sepak","bola","sepak","bola","bola","bola","sepak"],  
         ["uang","ekonomi","uang","uang","uang","ekonomi","ekonomi"],  
         ["sepak","bola","sepak","bola","sepak","sepak"],  
         ["ekonomi","ekonomi","uang","uang"],  
         ["sepak","uang","ekonomi"],  
         ["komputer","komputer","teknologi","teknologi","komputer","teknologi"],  
         ["teknologi","komputer","teknologi"]]  

dict_vocab_idx = get_word_dict(docs)
idxed_collection = word_to_idx(dict_vocab_idx, docs)

n_topics = 3
n_vocab = len(dict_vocab_idx)
n_docs = len(idxed_collection)
length_docs = [len(doc) for doc in idxed_collection]

alpha = np.ones([n_docs, n_topics])
beta = np.ones([n_topics, n_vocab])

with pm.Model() as model:
    theta = pm.distributions.Dirichlet('theta', a=alpha, shape=(n_docs, n_topics))
    phi = pm.distributions.Dirichlet('phi', a=beta, shape=(n_topics, n_vocab))
    zs = [pm.Categorical("z_d{}".format(d), p=theta[d], shape=length_docs[d]) for d in range(n_docs)]
    ws = [pm.Categorical("w_{}_{}".format(d,i), p=phi[zs[d][i]], observed=idxed_collection[d][i]) 
    for d in range(n_docs) for i in range(length_docs[d])]
    trace = pm.sample(2000)

for d in range(n_docs):
    value_z=trace.get_values("z_d{}".format(d))
    print(value_z[1999])