Bokeh 散点图在约 3k 个点时无法缩放或使用套索选择

时间:2018-06-13 21:24:41

标签: python performance pandas bokeh

我有以下函数,用于在 Jupyter notebook 中创建 Bokeh 图并选择点,然后绘制条形图,对比所选点的特征均值与其余点的特征均值。传递给函数的 DataFrame 包含由多种降维技术生成的坐标(x 和 y 即来源于此)。

数据只有几百行时效果很好;但当数据达到 3k 行时,图的渲染速度还算快,却无法缩放,也无法使用套索选择。有什么办法可以改进吗?还是说这只是 Bokeh 本身的局限?

  def graph_interactive(df_, algorithm, metric, local_host):
      """Define a Bokeh app with a selectable scatter plot and linked histograms.

      The nested ``app(doc)`` builds a scatter plot of the embedding
      coordinates, marginal histograms of x and y, and an ``update`` callback
      that redraws the histograms for the current selection and renders
      Matplotlib bar charts comparing feature means of selected vs.
      non-selected rows.

      Parameters
      ----------
      df_ : pandas.DataFrame
          Must contain the columns ``'x-<abc>'`` and ``'y-<abc>'`` (where
          ``<abc>`` is the first three characters of ``str(algorithm)``) and
          the column named by ``metric``.  The frame is copied, not mutated.
      algorithm : object
          Only ``str(algorithm)[:3]`` is used, to pick the coordinate columns.
      metric : str
          Column used for colouring.  ``'LengthOfStayDaysNBR'`` is coloured on
          a linear scale; any other metric is treated as a 0/1 flag.
      local_host : object
          Not used in the visible code.

      Notes
      -----
      NOTE(review): within this excerpt nothing registers ``update`` on the
      data source nor serves ``app``; presumably that happens in code that is
      not shown -- confirm against the full file.
      ``expected_metrics`` and ``metrics`` are read from an enclosing/global
      scope that is not visible here.
      """
      TOOLS = "pan,wheel_zoom,hover,box_select,lasso_select,reset"

      def app(doc):
          bokeh_df = df_.copy()
          prefix = str(algorithm)[:3]
          x_col = 'x-{}'.format(prefix)
          y_col = 'y-{}'.format(prefix)
          # Plain arrays: selection indices reported by Bokeh are positional,
          # so all later indexing must be positional as well (a Series with a
          # non-default index would be indexed by label instead).
          x = bokeh_df[x_col].values
          y = bokeh_df[y_col].values

          # create the scatter plot
          p = figure(tools=TOOLS, plot_width=500, plot_height=500, min_border=10, min_border_left=50,
                     toolbar_location="above", x_axis_location=None, y_axis_location=None,
                     title="Linked Histograms")
          # Only recompute the selection when the gesture ends, not on every
          # mouse move.
          p.select(BoxSelectTool).select_every_mousemove = False
          p.select(LassoSelectTool).select_every_mousemove = False

          if metric == 'LengthOfStayDaysNBR':
              color_col = metric
              color_mapper = linear_cmap(metric, palette=Spectral6,
                                         low=bokeh_df[metric].min(),
                                         high=bokeh_df[metric].max())
          else:
              color_col = metric + '_str'
              bokeh_df[color_col] = np.where(bokeh_df[metric] == 1, '1', '0')
              # Fixed factor/palette pairing: '1' is always red and '0' always
              # white, independent of which value happens to come first.
              color_mapper = factor_cmap(color_col, palette=['white', 'red'],
                                         factors=['0', '1'])

          # Hand the glyph only the columns it needs.  Passing the whole frame
          # would ship every feature column to the browser, which makes
          # zooming and selecting sluggish on wide data.
          r = p.scatter(source=bokeh_df[[x_col, y_col, color_col]],
                        x=x_col, y=y_col,
                        fill_color=color_mapper,
                        alpha=0.9)

          # Keep only the feature columns for the statistics computed later.
          bokeh_df = bokeh_df[[col for col in bokeh_df.columns
                               if ('x-' not in col)
                               and ('y-' not in col) and ('_str' not in col)]]

          # create the horizontal histogram
          hhist, hedges = np.histogram(x, bins=20)
          hzeros = np.zeros(len(hedges) - 1)
          hmax = max(hhist) * 1.1

          LINE_ARGS = dict(color="#3A5785", line_color=None)

          ph = figure(toolbar_location=None, plot_width=p.plot_width, plot_height=100, x_range=p.x_range,
                      y_range=(-hmax, hmax), min_border=10, min_border_left=50, y_axis_location="right")
          ph.xgrid.grid_line_color = None
          ph.yaxis.major_label_orientation = np.pi / 4
          ph.background_fill_color = "#fafafa"

          ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hhist, color="white", line_color="#3A5785")
          # hh1 / hh2 start empty and are filled in by update(): selected
          # points above the axis, the rest mirrored below it.
          hh1 = ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.5, **LINE_ARGS)
          hh2 = ph.quad(bottom=0, left=hedges[:-1], right=hedges[1:], top=hzeros, alpha=0.1, **LINE_ARGS)

          # create the vertical histogram
          vhist, vedges = np.histogram(y, bins=20)
          vzeros = np.zeros(len(vedges) - 1)
          vmax = max(vhist) * 1.1

          pv = figure(toolbar_location=None, plot_width=100, plot_height=p.plot_height, x_range=(-vmax, vmax),
                      y_range=p.y_range, min_border=10, y_axis_location="right")
          pv.ygrid.grid_line_color = None
          pv.xaxis.major_label_orientation = np.pi / 4
          pv.background_fill_color = "#fafafa"

          pv.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vhist, color="white", line_color="#3A5785")
          vh1 = pv.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.5, **LINE_ARGS)
          vh2 = pv.quad(left=0, bottom=vedges[:-1], top=vedges[1:], right=vzeros, alpha=0.1, **LINE_ARGS)

          layout = column(row(p, pv), row(ph, Spacer(width=200, height=200)))

          doc.add_root(layout)
          doc.title = "Selection Histogram"

          def update(attr, old, new):
              """Refresh histograms and bar charts for the new selection."""
              # dtype=int keeps an empty selection usable as an index array.
              inds = np.asarray(new['1d']['indices'], dtype=int)
              # Positional mask of the selected rows (builtin bool: the
              # np.bool alias was removed from NumPy).
              selected = np.zeros(len(x), dtype=bool)
              selected[inds] = True

              if len(inds) == 0 or len(inds) == len(x):
                  hhist1, hhist2 = hzeros, hzeros
                  vhist1, vhist2 = vzeros, vzeros
              else:
                  hhist1, _ = np.histogram(x[selected], bins=hedges)
                  vhist1, _ = np.histogram(y[selected], bins=vedges)
                  hhist2, _ = np.histogram(x[~selected], bins=hedges)
                  vhist2, _ = np.histogram(y[~selected], bins=vedges)

              hh1.data_source.data["top"]   =  hhist1
              hh2.data_source.data["top"]   = -hhist2
              vh1.data_source.data["right"] =  vhist1
              vh2.data_source.data["right"] = -vhist2

              # Calculate means of features for selected data
              selected_rows = bokeh_df.loc[selected]
              selected_mean = selected_rows[metric].mean()
              # get top 25
              selected_means = selected_rows.drop(expected_metrics + metrics + ['AgeNBR'], axis=1).mean() \
                  .sort_values(ascending=False)[:25]
              # sort for graph
              selected_means = selected_means.sort_values()

              # Create Figure
              fig, axs = plt.subplots(2, 1, figsize=(20, 15), facecolor='w', edgecolor='k')
              axs = axs.ravel()

              # Create first graph of bars of features
              axs[0] = plt.subplot(2, 1, 1)
              selected_means.plot(kind='barh', width=.4, position=1, color='red')
              # Second bars: means of the same features over the rows that
              # are NOT selected
              rest_means = bokeh_df[selected_means.index].loc[~selected].mean()
              rest_means.plot(kind='barh', width=.4, position=0, color='blue') \
                  .set_yticklabels([str(tick)[:45] for tick in rest_means.index])
              plt.xticks(fontsize=20)
              axs[0].yaxis.tick_right()
              axs[0].yaxis.grid(color='gray', linestyle='dashed')
              axs[0].xaxis.grid(color='gray', linestyle='dashed')

              plt.yticks(fontsize=23)
              plt.gca().set_title('Selected and full feature set averages', fontsize=30)

              # Create barchart of the selected minus the full features
              axs[1] = plt.subplot(2, 1, 2)
              difference = (selected_means - rest_means) / rest_means
              # Negative relative differences are scaled by 10 before plotting
              # (presumably so they stay visible -- TODO confirm intent).
              difference[difference < 0] *= 10
              bar_colors = ['r' if value > 0 else 'b' for value in difference]
              difference.plot(kind='barh', color=bar_colors) \
                  .set_yticklabels([str(tick)[:45] for tick in difference.index])
              plt.xticks(fontsize=20)
              plt.gca().set_title('Selected minus full feature set averages divided by full', fontsize=30)
              axs[1].yaxis.tick_right()
              axs[1].yaxis.grid(color='gray', linestyle='dashed')
              axs[1].xaxis.grid(color='gray', linestyle='dashed')
              plt.yticks(fontsize=23)
              plt.tight_layout()

              total_mean = bokeh_df[metric].mean()
              print ('Selected {0} rate: {1:.4f}\nTotal {0} rate: {2:.4f}\nSelected minus total divided by total: {3:.4f}'\
                         .format(metric,selected_mean,total_mean,(selected_mean-total_mean)/total_mean))

              plt.show()

0 个答案:

没有答案