我正在尝试将pymc3与从某些观察到的数据派生的似然函数一起使用。这些观察到的数据不适合任何标准的标准分布,因此我想根据这些观察来定义自己的数据。
一种方法是对观察值使用核密度估计。这在pymc2中是可能的,但不能与pymc3中的Theano变量很好地配合。
在下面的代码中,我只是生成一些正态分布的伪数据。和我以前一样,我基本上是为观察结果分配一个均匀的分布。
这是我的代码:
from scipy import stats
import numpy as np
import pymc3 as pm
from sklearn.neighbors.kde import KernelDensity
data = np.sort(stats.norm.rvs(loc=0, scale=1, size=1000))
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(data.reshape(-1, 1))
def get_log_likelihood(x):
return kde.score_samples(x)
with pm.Model() as test_model:
x = pm.Uniform('prior rv', lower=-10, upper=10)
obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})
step = pm.Metropolis()
trace = pm.sample(200, step=step)
我收到的错误似乎是kde score_samples
函数爆炸,因为它期望一个数组,但是x
是Theano变量。
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-49-4efbbe7376dc> in <module>()
1 with pm.Model() as test_model:
2 x = pm.Uniform('prior rv', lower=0.0, upper=1e6)
----> 3 obs = pm.DensityDist('observed likelihood', get_log_likelihood, observed={'x': x})
4
5 step = pm.Metropolis()
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/distributions/distribution.py in __new__(cls, name, *args, **kwargs)
40 total_size = kwargs.pop('total_size', None)
41 dist = cls.dist(*args, **kwargs)
---> 42 return model.Var(name, dist, data, total_size)
43 else:
44 raise TypeError("Name needs to be a string but got: {}".format(name))
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in Var(self, name, dist, data, total_size)
825 with self:
826 var = MultiObservedRV(name=name, data=data, distribution=dist,
--> 827 total_size=total_size, model=self)
828 self.observed_RVs.append(var)
829 if var.missing_values:
~/research_notebooks/venv/lib/python3.6/site-packages/pymc3/model.py in __init__(self, name, data, distribution, total_size, model)
1372 self.missing_values = [datum.missing_values for datum in self.data.values()
1373 if datum.missing_values is not None]
-> 1374 self.logp_elemwiset = distribution.logp(**self.data)
1375 # The logp might need scaling in minibatches.
1376 # This is done in `Factor`.
<ipython-input-48-535f58ce543b> in get_log_likelihood(x)
1 def get_log_likelihood(x):
----> 2 return kde.score_samples(x)
~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/neighbors/kde.py in score_samples(self, X)
150 # For it to be a probability, we must scale it. For this reason
151 # we'll also scale atol.
--> 152 X = check_array(X, order='C', dtype=DTYPE)
153 N = self.tree_.data.shape[0]
154 atol_N = self.atol * N
~/research_notebooks/venv/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
431 force_all_finite)
432 else:
--> 433 array = np.array(array, dtype=dtype, order=order, copy=copy)
434
435 if ensure_2d:
ValueError: setting an array element with a sequence.
任何帮助将不胜感激。谢谢!