I am currently working through Think Bayes (http://www.greenteapress.com/thinkbayes/thinkbayes.pdf), an introductory text on Bayesian statistics, and I am trying to reproduce chapter 10. In that chapter, a dataset of heights is used to build a Bayesian model for the mean and standard deviation of a population, given a particular sample.
The idea is to make the purely computational approach more robust in several ways, including approximate Bayesian computation, log-likelihoods, and restricting the range of the priors for mu and sigma by using known sampling rules (e.g. the standard error of the mean, S = sigma / sqrt(n)).
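For reference, the two sampling rules I am relying on look like this (a minimal sketch, where s is the sample standard deviation and n is the sample size; both formulas reappear in FindPriorRanges below):

stderr_m = s / np.sqrt(n)            # standard error of the sample mean
stderr_s = s / np.sqrt(2 * (n - 1))  # standard error of the sample standard deviation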
My code seems like it should do this, but the answers it gives are clearly wrong:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
def MakeRange(estimate, stderr, num_stderrs, num_points=100):
    spread = stderr * num_stderrs
    array = np.linspace(estimate - spread, estimate + spread, num_points)
    return array
def FindPriorRanges(xs, num_stderrs=3.0):
    # compute m and s
    n = len(xs)
    m = np.mean(xs)
    s = np.std(xs)
    # compute ranges for m and s
    stderr_m = s / np.sqrt(n)
    mus = MakeRange(m, stderr_m, num_stderrs)
    stderr_s = s / np.sqrt(2 * (n - 1))
    sigmas = MakeRange(s, stderr_s, num_stderrs)
    return mus, sigmas
def Likelihood(x, mus, sigmas):
    # fill a (len(x), len(mus), len(sigmas)) array with normal pdfs via np.fromfunction
    like = np.nan_to_num(np.fromfunction(
        lambda x, a, b: stats.norm.pdf(x, mus[a], sigmas[b]),
        (len(x), len(mus), len(sigmas)), dtype=int))
    return like
def logLikelyhood(x, mu, sigma):
    like = Likelihood(x, mu, sigma)
    largest = np.max(like)
    return np.nan_to_num(np.log(like / largest))
# his code also has something to remove 0-probability elements to prevent -Inf when the log is applied
def expLogLikelyhood(logdist):
    m = np.max(logdist)
    return np.exp(logdist - m)
#note that this is extremely slow, and was made to ensure the algorithm is working exactly as expected
def approxBayesianComputation(data, mus, sigmas):
    n = len(data)
    m = np.mean(data)
    s = np.std(data)
    # he adds together the (log) likelihoods of the mean and the sigma occurring
    # given this particular dataset (rather than the exact numbers we observed)
    # would be two separate apply-along-axis calls
    # muloglike = np.apply_along_axis(func1d, 0, arr)
    # very slow
    # using log likelihood (same result as below)
    # arr = np.ones([len(mus), len(sigmas)])
    # for muind, mu in enumerate(mus):
    #     for sigmaind, sigma in enumerate(sigmas):
    #         stderr_m = sigma / np.sqrt(n)
    #         loglike = np.log(stats.norm.pdf(m, mu, stderr_m))
    #         stderr_s = sigma / np.sqrt(2 * (n - 1))
    #         loglike += np.log(stats.norm.pdf(s, sigma, stderr_s))
    #         arr[muind][sigmaind] = loglike
    # return np.exp(arr)
    # without log likelihood
    arr = np.ones([len(mus), len(sigmas)])
    for muind, mu in enumerate(mus):
        for sigmaind, sigma in enumerate(sigmas):
            stderr_m = sigma / np.sqrt(n)
            like = stats.norm.pdf(m, mu, stderr_m)
            stderr_s = sigma / np.sqrt(2 * (n - 1))
            like *= stats.norm.pdf(s, sigma, stderr_s)
            arr[muind][sigmaind] = like
    return arr
def Normalize(arr):
    return arr / np.sum(arr)
#data=[1,2,3,4,5,6,7,8,2,3,4,5,6,7,3,3,4,4,1,1,1,1,1,1,1,1,1,1,1,1,12,2]*70
#data=[1,4,2,5,6,7,8,-3,-4,-5,-1]
#data=[1,1,1,1,1,1,2,2,2,0,0,0]
data=[1,2,3,4,6]
#data=[-2,0,2,3,4,5]
#data=[7,7,7,7,7,7,7,7,7,7,7,7,7.1]
#data=[0,-1,1,-2,2,-3,3,-4,4,-5,5,-6,6]
#data=[0,0,0,0]
#data=[5,5,5,5]
#all likely ranges for mu and sigma
mus, sigmas=FindPriorRanges(data)
#sigmas=np.linspace(.01,10,100)
#mus=np.linspace(0,30,1000)
prior=np.ones([len(mus),len(sigmas)])
#make 2d distribution
multarray=Likelihood(data,mus,sigmas)
posterior=Normalize(np.multiply(prior,np.prod(multarray,axis=0)))
plt.imshow(posterior, cmap='gist_heat', interpolation='nearest', extent=[min(sigmas), max(sigmas), min(mus), max(mus)])
plt.show()
# using log likelihoods for larger datasets (note that this works on large datasets where the above fails due to discretization error)
multarray=logLikelyhood(data,mus,sigmas)
# the sum of the logs is the log of the product
posterior=Normalize(np.multiply(prior,expLogLikelyhood(np.sum(multarray,axis=0))))
plt.imshow(posterior, cmap='gist_heat', interpolation='nearest',extent=[min(sigmas),max(sigmas),min(mus),max(mus)])
plt.show()
plt.imshow(approxBayesianComputation(data, mus, sigmas), cmap='gist_heat', interpolation='nearest',extent=[min(sigmas),max(sigmas),min(mus),max(mus)])
plt.show()
What the program does is compute a range of mus (means) and sigmas (standard deviations) from a particular sample of data (named data in the code). By default, it currently generates 100 points for each parameter.
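As a quick usage sketch (this just calls the functions defined above; the endpoints are whatever they return, not hand-checked numbers):

data = [1, 2, 3, 4, 6]
mus, sigmas = FindPriorRanges(data)
# each array holds 100 evenly spaced points spanning estimate +/- 3 standard errors
print(mus.min(), mus.max())
print(sigmas.min(), sigmas.max())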
np.fromfunction is used to populate an array with dimensions (len(data), len(mus), len(sigmas)), using scipy's normal pdf as a pointwise function. A product is then taken along the data axis to update the distribution, and the result is multiplied by the prior, which I take to be uniform. For logLikelyhood the process is almost identical, except that it sums along the data axis (the sum of the logs is the log of the product), converts the logs back into ordinary probabilities by exponentiating, and then multiplies by the prior.
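To make the intended computation explicit, the fromfunction call is meant to fill the likelihood cube the same way this slow, for-clarity-only loop would (a sketch of my intent, reusing data, mus, and sigmas from above):

# intended contents of the (len(data), len(mus), len(sigmas)) likelihood cube
like = np.empty((len(data), len(mus), len(sigmas)))
for i, xi in enumerate(data):
    for a, mu in enumerate(mus):
        for b, sigma in enumerate(sigmas):
            like[i, a, b] = stats.norm.pdf(xi, mu, sigma)
# the posterior is then prior * np.prod(like, axis=0), normalized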
However, when I run the code I notice some strange things:
The likelihood and the log-likelihood always produce the same distribution. However, I get some divide-by-zero errors, and with the log-likelihood some distributions produce nan values, which I have smoothed over with np.nan_to_num(). The two agree even for distributions that do not trigger these errors (and the errors do not seem to affect the results significantly). Also, approxBayesianComputation, which was designed to make it transparent what is going on (it slowly multiplies everything together with plain Python for loops), gives answers similar to the fast numpy implementations.
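For what it's worth, here is my understanding of where the divide-by-zero comes from, as a small standalone check (not part of the program above):

import numpy as np

# np.log(0.) warns "divide by zero" and yields -inf; np.nan_to_num then maps
# -inf to the most negative finite float, whose exp() underflows back to 0,
# which would explain why the smoothing barely changes the final distribution
vals = np.log(np.array([0.0, 0.5, 1.0]))  # [-inf, -0.693..., 0.0]
print(np.nan_to_num(vals))                # [-1.797...e+308, -0.693..., 0.0]
print(np.exp(np.nan_to_num(vals)))        # [0.0, 0.5, 1.0]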