我用下面的函数在 Jupyter 笔记本中创建一个 Bokeh 图,在图中选择点之后,绘制条形图来比较所选点的各特征平均值与其余点的各特征平均值。传给函数的数据帧包含由多种不同降维技术生成的坐标(x 和 y 就来自这些坐标列)。
只有几百行数据时效果很好;但当数据达到 3k 行时,虽然渲染速度还算快,却无法缩放或使用套索选择。有什么办法可以改进吗,还是说这只是 Bokeh 本身的限制?
def graph_interactive(df_, algorithm, metric, local_host):
    """Define a Bokeh app: a selectable scatter plot with linked histograms.

    The scatter plot draws the columns ``'x-<abc>'`` / ``'y-<abc>'`` of
    ``df_``, where ``<abc>`` is the first three characters of
    ``str(algorithm)``.  A nested ``update`` callback redraws the marginal
    histograms for a selection and plots, with matplotlib, the feature means
    of the selected rows next to the means of the unselected rows.

    Args:
        df_: pandas DataFrame holding the coordinate columns, the ``metric``
            column and the feature columns.  It is copied, never mutated.
        algorithm: object whose ``str()`` prefix (3 chars) names the
            coordinate columns.
        metric: name of the column used to colour the points.
            ``'LengthOfStayDaysNBR'`` is coloured on a linear scale; any other
            metric is treated as a 0/1 flag (1 -> red, everything else ->
            white).
        local_host: not referenced in the visible part of this function.

    NOTE(review): the code that registers ``update`` on the scatter glyph's
    data source and that serves ``app`` is not visible in this section.
    """
    TOOLS = "pan,wheel_zoom,hover,box_select,lasso_select,reset"
    x_col = 'x-{}'.format(str(algorithm)[:3])
    y_col = 'y-{}'.format(str(algorithm)[:3])

    def app(doc):
        """Populate ``doc`` with the scatter plot and its two histograms."""
        bokeh_df = df_.copy()
        # Plain NumPy arrays: selection indices are positional, so they must
        # not be interpreted as labels of the DataFrame index.
        x = bokeh_df[x_col].values
        y = bokeh_df[y_col].values

        # create the scatter plot
        p = figure(tools=TOOLS, plot_width=500, plot_height=500, min_border=10, min_border_left=50,
                   toolbar_location="above", x_axis_location=None, y_axis_location=None,
                   title="Linked Histograms")
        p.select(BoxSelectTool).select_every_mousemove = False
        p.select(LassoSelectTool).select_every_mousemove = False

        if metric == 'LengthOfStayDaysNBR':
            color_field = metric
            color_mapper = linear_cmap(metric, palette=Spectral6,
                                       low=bokeh_df[metric].min(), high=bokeh_df[metric].max())
        else:
            color_field = metric + '_str'
            bokeh_df[color_field] = np.where(bokeh_df[metric] == 1, '1', '0')
            # Fixed factor/palette pairing so the colours do not depend on
            # which value happens to appear first in the data.
            color_mapper = factor_cmap(color_field, palette=['white', 'red'], factors=['0', '1'])

        # Only hand the three columns the glyph needs to Bokeh; passing the
        # whole frame would serialise every feature column to the browser.
        r = p.scatter(source=bokeh_df[[x_col, y_col, color_field]], x=x_col, y=y_col,
                      fill_color=color_mapper, alpha=0.9)

        # Frame used for the feature means: coordinate and helper columns removed.
        feature_df = bokeh_df[[col for col in bokeh_df.columns
                               if 'x-' not in col and 'y-' not in col and '_str' not in col]]

        # create the horizontal histogram
        hhist, hedges = np.histogram(x, bins=20)
        hzeros = np.zeros(len(hedges) - 1)
        hmax = max(hhist) * 1.1
        LINE_ARGS = dict(color="#3A5785", line_color=None)
        ph = figure(toolbar_location=None, plot_width=p.plot_width, plot_height=100, x_range=p.x_range,
                    y_range=(-hmax, hmax), min_border=10, min_border_left=50, y_axis_location="right")
        ph.xgrid.grid_line_color = None
        ph.yaxis.major_label_orientation = np.pi / 4
        ph.background_fill_color = "#fafafa"
        ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hhist, color="white", line_color="#3A5785")
        # hh1 shows the selected points (upwards), hh2 the rest (downwards).
        hh1 = ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.5, **LINE_ARGS)
        hh2 = ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.1, **LINE_ARGS)

        # create the vertical histogram
        vhist, vedges = np.histogram(y, bins=20)
        vzeros = np.zeros(len(vedges) - 1)
        vmax = max(vhist) * 1.1
        pv = figure(toolbar_location=None, plot_width=100, plot_height=p.plot_height, x_range=(-vmax, vmax),
                    y_range=p.y_range, min_border=10, y_axis_location="right")
        pv.ygrid.grid_line_color = None
        pv.xaxis.major_label_orientation = np.pi / 4
        pv.background_fill_color = "#fafafa"
        pv.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vhist, color="white", line_color="#3A5785")
        vh1 = pv.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.5, **LINE_ARGS)
        vh2 = pv.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.1, **LINE_ARGS)

        layout = column(row(p, pv), row(ph, Spacer(width=200, height=200)))
        doc.add_root(layout)
        doc.title = "Selection Histogram"

        def update(attr, old, new):
            """Refresh the histograms and the matplotlib report for a selection.

            NOTE(review): assumes ``new`` is a selection mapping with a
            ``'1d' -> 'indices'`` entry of positional row indices -- confirm
            against how this callback is registered.
            """
            inds = np.array(new['1d']['indices'], dtype=int)
            # Positional boolean mask of the selected rows.
            selected = np.zeros(len(x), dtype=bool)
            selected[inds] = True

            if len(inds) == 0 or len(inds) == len(x):
                hhist1, hhist2 = hzeros, hzeros
                vhist1, vhist2 = vzeros, vzeros
            else:
                hhist1, _ = np.histogram(x[selected], bins=hedges)
                vhist1, _ = np.histogram(y[selected], bins=vedges)
                hhist2, _ = np.histogram(x[~selected], bins=hedges)
                vhist2, _ = np.histogram(y[~selected], bins=vedges)
            hh1.data_source.data["top"] = hhist1
            hh2.data_source.data["top"] = -hhist2
            vh1.data_source.data["right"] = vhist1
            vh2.data_source.data["right"] = -vhist2

            if len(inds) == 0:
                # Nothing selected: there are no means to compare, so skip
                # the report instead of drawing an all-NaN chart.
                return

            # Calculate means of features for selected data
            chosen = feature_df[selected]
            selected_mean = chosen[metric].mean()
            # 25 largest feature means of the selection, re-sorted ascending
            # so the largest bar ends up at the top of the horizontal chart.
            # `expected_metrics` and `metrics` come from the enclosing module.
            top_means = (chosen.drop(expected_metrics + metrics + ['AgeNBR'], axis=1)
                         .mean()
                         .sort_values(ascending=False)
                         .iloc[:25]
                         .sort_values())
            # Means of the same features over the rows that are NOT selected.
            rest_means = feature_df.loc[~selected, top_means.index].mean()

            # Create Figure
            fig, (ax_top, ax_bottom) = plt.subplots(2, 1, figsize=(20, 15), facecolor='w', edgecolor='k')

            # First chart: selected means (red) beside unselected means (blue)
            top_means.plot(kind='barh', width=.4, position=1, color='red', ax=ax_top)
            rest_means.plot(kind='barh', width=.4, position=0, color='blue', ax=ax_top)
            ax_top.set_yticklabels([str(tick)[:45] for tick in rest_means.index])
            ax_top.yaxis.tick_right()
            ax_top.yaxis.grid(color='gray', linestyle='dashed')
            ax_top.xaxis.grid(color='gray', linestyle='dashed')
            ax_top.tick_params(axis='x', labelsize=20)
            ax_top.tick_params(axis='y', labelsize=23)
            ax_top.set_title('Selected and full feature set averages', fontsize=30)

            # Second chart: relative difference between the two sets of means
            difference = (top_means - rest_means) / rest_means
            # Negative differences are scaled by 10 (kept from the original;
            # the reason is not stated in the code).
            difference[difference < 0] *= 10
            bar_colors = ['r' if value > 0 else 'b' for value in difference]
            difference.plot(kind='barh', color=bar_colors, ax=ax_bottom)
            ax_bottom.set_yticklabels([str(tick)[:45] for tick in difference.index])
            ax_bottom.set_title('Selected minus full feature set averages divided by full', fontsize=30)
            ax_bottom.yaxis.tick_right()
            ax_bottom.yaxis.grid(color='gray', linestyle='dashed')
            ax_bottom.xaxis.grid(color='gray', linestyle='dashed')
            ax_bottom.tick_params(axis='x', labelsize=20)
            ax_bottom.tick_params(axis='y', labelsize=23)
            plt.tight_layout()

            total_mean = feature_df[metric].mean()
            print('Selected {0} rate: {1:.4f}\nTotal {0} rate: {2:.4f}\nSelected minus total divided by total: {3:.4f}'
                  .format(metric, selected_mean, total_mean, (selected_mean - total_mean) / total_mean))
            plt.show()