Suppose I have several histograms, each with counts at different bin locations (on the real axis), e.g.:
import numpy as np

def generate_random_histogram():
    # Random bin locations between 0 and 100
    bin_locations = np.random.rand(10,) * 100
    bin_locations.sort()
    # Random counts between 0 and 50 on those locations
    bin_counts = np.random.randint(50, size=len(bin_locations))
    return {'loc': bin_locations, 'count': bin_counts}
# We can assume that the bin size is either pre-defined or that
# the bin edges are on the middle-point between consecutive counts.
hists = [generate_random_histogram() for x in xrange(3)]
How do I normalize these histograms so that I get PDFs, where the integral of each PDF adds up to one within a given range (e.g. 0 and 100)?
We can assume that the histograms count events on pre-defined bin sizes (e.g. 10).
Most implementations I have seen are based, for example, on Gaussian kernels that start from the data (see scipy and scikit-learn). In my case, I need to do this from the histograms, since I do not have access to the original data.
Note that all current answers assume that we are looking at a random variable in (-Inf, +Inf). That is fine as a rough approximation, but it may not hold depending on the application, where the variable may be defined within some other range [a, b] (e.g. 0 and 100 in the case above).
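To make the bounded-range point concrete, here is a minimal sketch (not part of the original question; the data array is only a placeholder) of how a Gaussian KDE defined on (-Inf, +Inf) can be renormalized to a finite support [a, b] using scipy's gaussian_kde.integrate_box_1d:

import numpy as np
from scipy.stats import gaussian_kde

a, b = 0.0, 100.0                      # assumed support of the variable
data = np.random.rand(1000) * 100      # placeholder data, for illustration only

kde = gaussian_kde(data)
mass_in_range = kde.integrate_box_1d(a, b)   # probability mass the KDE puts on [a, b]

def bounded_pdf(x):
    # Renormalized density: integrates to one over [a, b] instead of (-Inf, +Inf)
    return kde(x) / mass_in_range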
Answer 0 (score: 6)
The subtle point is defining bin_edges, since technically they could go anywhere. I chose the midpoint between each pair of bin centers. There may be other ways to do this, but here is one:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

hists = [generate_random_histogram() for x in xrange(3)]
for h in hists:
    bin_locations = h['loc']
    bin_counts = h['count']
    # Edges: 0 and 100 at the ends, midpoints between consecutive bin centers in between
    bin_edges = np.concatenate([[0], (bin_locations[1:] + bin_locations[:-1])/2, [100]])
    bin_widths = np.diff(bin_edges)
    # Normalize so that sum(bin_density * bin_widths) == 1
    bin_density = bin_counts.astype(float) / np.dot(bin_widths, bin_counts)
    h['density'] = bin_density

    # Approximate the raw data by repeating each bin center by its count, then fit a KDE
    data = np.repeat(bin_locations, bin_counts)
    h['kde'] = gaussian_kde(data)

    plt.step(bin_locations, bin_density, where='mid', label='normalized')
    plt.plot(np.linspace(0, 100), h['kde'](np.linspace(0, 100)), label='kde')
This produces plots like the following (one per histogram):
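As a quick sanity check (not part of the original answer), you can verify numerically that each normalized histogram integrates to one over [0, 100]:

for h in hists:
    edges = np.concatenate([[0], (h['loc'][1:] + h['loc'][:-1]) / 2, [100]])
    print(np.sum(h['density'] * np.diff(edges)))   # should print 1.0 for every histogram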
Answer 1 (score: 3)
Here is one possibility. I am not crazy about it, but it kind of works. Note that the histograms are a bit odd, since the bin widths vary quite a lot.
import numpy as np
from matplotlib import pyplot
from sklearn.mixture.gmm import GMM
from sklearn.grid_search import GridSearchCV
def generate_random_histogram():
    # Random bin locations between 0 and 100
    bin_locations = np.random.rand(10,) * 100
    bin_locations.sort()
    # Random counts on those locations
    bin_counts = np.random.randint(50, size=len(bin_locations))
    return {'loc': bin_locations, 'count': bin_counts}

def bin_widths(loc):
    widths = []
    for i in range(len(loc)-1):
        widths.append(loc[i+1] - loc[i])
    widths.append(widths[-1])
    widths = np.array(widths)
    return widths

def sample_from_hist(loc, count, size):
    n = len(loc)
    tot = np.sum(count)
    widths = bin_widths(loc)
    lowers = loc - widths
    uppers = loc + widths
    probs = count / float(tot)
    # Pick a bin for each draw according to the counts, then draw uniformly within that bin
    bins = np.argmax(np.random.multinomial(n=1, pvals=probs, size=(size,)), 1)
    return np.random.uniform(lowers[bins], uppers[bins])
# Generate the histogram
hist = generate_random_histogram()
# Sample from the histogram
sample = sample_from_hist(hist['loc'],hist['count'],np.sum(hist['count']))
# Fit a GMM
param_grid = [{'n_components':[1,2,3,4,5]}]
def scorer(est, X, y=None):
    return np.sum(est.score(X))
grid_search = GridSearchCV(GMM(), param_grid, scoring=scorer).fit(np.reshape(sample,(len(sample),1)))
gmm = grid_search.best_estimator_
# Sample from the GMM
gmm_sample = gmm.sample(np.sum(hist['count']))
# Plot the resulting histograms and mixture distribution
fig = pyplot.figure()
ax1 = fig.add_subplot(311)
widths = bin_widths(hist['loc'])
ax1.bar(hist['loc'], hist['count'], width=widths)
ax2 = fig.add_subplot(312, sharex=ax1)
ax2.hist(gmm_sample, bins=hist['loc'])
x = np.arange(min(sample), max(sample), .1)
y = np.exp(gmm.score(x))
ax3 = fig.add_subplot(313, sharex=ax1)
ax3.plot(x, y)
pyplot.show()
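Note that sklearn.mixture.gmm.GMM and sklearn.grid_search have since been removed from scikit-learn; on a recent version, roughly the same idea could be sketched with GaussianMixture and GridSearchCV from sklearn.model_selection (this is an adaptation, not the original answer's code):

import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import GridSearchCV

X = np.reshape(sample, (-1, 1))

def scorer(est, X, y=None):
    # Total log-likelihood of the held-out data under the fitted mixture
    return np.sum(est.score_samples(X))

param_grid = {'n_components': [1, 2, 3, 4, 5]}
grid_search = GridSearchCV(GaussianMixture(), param_grid, scoring=scorer).fit(X)
gmm = grid_search.best_estimator_

gmm_sample, _ = gmm.sample(np.sum(hist['count']))        # sample() now returns (X, labels)
x = np.arange(min(sample), max(sample), .1)
density = np.exp(gmm.score_samples(x.reshape(-1, 1)))    # mixture density on a grid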
Answer 2 (score: 2)
Here is a way to do it with pymc. This method uses a fixed number of components (n_components) in the mixture model. You could try attaching a prior to n_components and sampling over it. Alternatively, you could just pick something reasonable, or use the grid-search technique from my other answer to estimate the number of components. In the code below I use 10000 iterations with a burn-in of 9000, but that might not be enough to get good results; I would suggest a much larger burn-in. I also chose the priors somewhat arbitrarily, so those may be something to look at. You will have to experiment with it. Good luck. Here is the code.
import numpy as np
import pymc as mc
import scipy.stats as stats
from matplotlib import pyplot
def generate_random_histogram():
    # Random bin locations between 0 and 100
    bin_locations = np.random.rand(10,) * 100
    bin_locations.sort()
    # Random counts on those locations
    bin_counts = np.random.randint(50, size=len(bin_locations))
    return {'loc': bin_locations, 'count': bin_counts}
def bin_widths(loc):
    widths = []
    for i in range(len(loc)-1):
        widths.append(loc[i+1] - loc[i])
    widths.append(widths[-1])
    widths = np.array(widths)
    return widths
def mixer(name, weights, value=None):
    # Custom stochastic node: assigns each observation to a mixture component
    # according to the component weights.
    n = value.shape[0]
    def logp(value, weights):
        vals = np.zeros(shape=(n, weights.shape[1]), dtype=int)
        vals[:, value.astype(int)] = 1
        return mc.multinomial_like(x=vals, n=1, p=weights)
    def random(weights):
        return np.argmax(np.random.multinomial(n=1, pvals=weights[0,:], size=n), 1)
    result = mc.Stochastic(logp=logp,
                           doc='A kernel smoothing density node.',
                           name=name,
                           parents={'weights': weights},
                           random=random,
                           trace=True,
                           value=value,
                           dtype=int,
                           observed=False,
                           cache_depth=2,
                           plot=False,
                           verbose=0)
    return result
def create_model(lowers, uppers, count, n_components):
    n = np.sum(count)
    lower = min(lowers)
    upper = max(uppers)
    locations = mc.Uniform(name='locations', lower=lower, upper=upper, value=np.random.uniform(lower, upper, size=n_components), observed=False)
    precisions = mc.Gamma(name='precisions', alpha=1, beta=1, value=.001*np.ones(n_components), observed=False)
    weights = mc.CompletedDirichlet('weights', mc.Dirichlet(name='weights_ind', theta=np.ones(n_components)))

    choice = mixer('choice', weights, value=np.ones(n).astype(int))

    @mc.observed
    def histogram_data(value=count, locations=locations, precisions=precisions, weights=weights):
        # Likelihood of the observed bin counts: multinomial over the bins, with
        # probabilities given by the mixture CDF evaluated at the bin edges.
        if hasattr(weights, 'value'):
            weights = weights.value
        lower_cdfs = sum([weights[0,i]*stats.norm.cdf(lowers, loc=locations[i], scale=np.sqrt(1.0/precisions[i])) for i in range(weights.shape[1])])
        upper_cdfs = sum([weights[0,i]*stats.norm.cdf(uppers, loc=locations[i], scale=np.sqrt(1.0/precisions[i])) for i in range(weights.shape[1])])
        bin_probs = upper_cdfs - lower_cdfs
        # Add a catch-all bin for any probability mass that falls outside the histogram
        bin_probs = np.array(list(upper_cdfs - lower_cdfs) + [1.0 - np.sum(bin_probs)])
        n = np.sum(count)
        return mc.multinomial_like(x=np.array(list(count) + [0]), n=n, p=bin_probs)

    @mc.deterministic
    def location(locations=locations, choice=choice):
        return locations[choice.astype(int)]

    @mc.deterministic
    def dispersion(precisions=precisions, choice=choice):
        return precisions[choice.astype(int)]

    data_generator = mc.Normal('data', mu=location, tau=dispersion)
    return locals()
# Generate the histogram
hist = generate_random_histogram()
loc = hist['loc']
count = hist['count']
widths = bin_widths(hist['loc'])
lowers = loc - widths
uppers = loc + widths
# Create the model
model = create_model(lowers, uppers, count, 5)
# Initialize to the MAP estimate
model = mc.MAP(model)
model.fit(method ='fmin')
# Now sample with MCMC
model = mc.MCMC(model)
model.sample(iter=10000, burn=9000, thin=300)
# Plot the mu and tau traces
mc.Matplot.plot(model.trace('locations'))
pyplot.show()
# Get the samples from the fitted pdf
sample = np.ravel(model.trace('data')[:])
# Plot the original histogram, sampled histogram, and pdf
lower = min(lowers)
upper = max(uppers)
kde = stats.gaussian_kde(sample)
x = np.arange(0,100,.1)
y = kde(x)
fig = pyplot.figure()
ax1 = fig.add_subplot(311)
pyplot.xlim(lower,upper)
ax1.bar(loc, count, width=widths)
ax2 = fig.add_subplot(312, sharex=ax1)
ax2.hist(sample, bins=loc)
ax3 = fig.add_subplot(313, sharex=ax1)
ax3.plot(x, y)
pyplot.show()
As you can see, the two distributions do not look exactly alike; then again, a histogram is not much to go on. I would play with different numbers of components and more iterations/burn-in, but it is a project. Depending on your priorities, I suspect @askewchan's answer or my other answer may serve you better.
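If you want something more quantitative than eyeballing the plots, one rough check (not in the original answer) is to compare the observed bin proportions with the probability mass the fitted density assigns to each bin:

# 'kde', 'lowers', 'uppers' and 'count' are the objects defined in the code above.
observed = count / float(np.sum(count))
fitted = np.array([kde.integrate_box_1d(lo, hi) for lo, hi in zip(lowers, uppers)])
for o, f in zip(observed, fitted):
    print('%.3f  %.3f' % (o, f))

Because lowers and uppers were built as loc - width and loc + width, the bins overlap and the fitted column will not sum exactly to one; it is only meant as a rough comparison.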