我有下面的示例数据的代码和数据。在代码中,我创建了3个函数。第一个通过分类变量列表绘制连续变量的分布。第二个为数据中的时间段列表创建列。然后,第三个将它们组合起来,以一列时间周期列的形式绘制连续变量的分布图。我已经分别测试了前两个功能,它们按预期工作,但是当我尝试运行第三个功能时,出现以下关键错误。如果有人能看到问题所在或提出解决方法的建议,我将不胜感激。
样本数据:
Unnamed: 0 call_history_id calllog_id \
0 16358 1210746736 ca58d850-6fe6-4673-a049-ea4a2d8d7ecf
1 16361 1210976828 c005329b-955d-4d88-98a5-1c47e6a1cb80
2 16402 1217791595 050e9b83-54c2-4c87-abdd-32225c0d3189
3 16471 1228495414 45705ed1-a8e2-4a15-8941-5b0a40b7d409
4 27906 1245173592 04e56818-04a0-4704-ac86-31c31dac2370
call_id connection_id pbx_name pbx_id extension_number \
0 1.509170e+12 1.509170e+12 sales8x8 sales8x8 595
1 1.509170e+12 1.509170e+12 sales8x8 sales8x8 595
2 1.509170e+12 1.509170e+12 sales8x8 sales8x8 595
3 1.509170e+12 1.509170e+12 sales8x8 sales8x8 595
4 1.509170e+12 1.509170e+12 sales8x8 sales8x8 595
extension_id customer_id address name \
0 595 2.525100e+29 14086694428 Sun Basket
1 595 2.525100e+29 13214371589 PEREZ,BRYAN
2 595 2.525100e+29 14088566290 14088566290
3 595 2.525100e+29 8059316676 Dialing
4 595 2.525100e+29 12028071151 Implementation Team
start_timestamp direction call_internal call_missed duration \
0 1/8/18 19:49 I 0.0 0.0 4414.0
1 1/8/18 20:09 I 0.0 0.0 8300.0
2 1/9/18 20:31 I 0.0 0.0 14766.0
3 1/11/18 17:16 I 0.0 0.0 1686.0
4 1/15/18 22:55 I 0.0 0.0 3491.0
device_model group_call group_name group_number device_id \
0 mediaserver 0.0 N N MasterSlaveService
1 mediaserver 0.0 N N MasterSlaveService
2 mediaserver 0.0 N N MasterSlaveService
3 mediaserver 0.0 N N MasterSlaveService
4 mediaserver 0.0 N N MasterSlaveService
history_event_state created_time updated_time group_type
0 A 1/8/18 19:49 1/8/18 19:49 N
1 A 1/8/18 20:09 1/8/18 20:09 NaN
2 A 1/9/18 20:31 1/9/18 20:31 N
3 A 1/11/18 17:16 1/11/18 17:16 N
4 A 1/15/18 22:55 1/15/18 22:55 N
代码:
# plot all continuous variables by all categorical variables
# get distributions
def plotter(ca_col,co_col,d_df,p_typ='boxplot'):
if ca_col is None:
for i in cont_col:
if p_typ=='boxplot':
data_df.boxplot(column=i)
elif p_typ=='hist':
data_df.hist(column=i)
elif ca_col!=None:
for j in ca_col:
for i in cont_col:
if p_typ=='boxplot':
data_df.boxplot(column=i,by=j)
elif p_typ=='hist':
data_df.hist(column=i,by=j)
# Create column for specified time periods in data
def ts_periods(f_nm, d_list, d_df):
t_df=d_df.copy()
for i in d_list:
if i=='year':
t_df[f_nm+'_year']=pd.DatetimeIndex(t_df[f_nm]).year
elif i=='month':
t_df[f_nm+'_month']=pd.DatetimeIndex(t_df[f_nm]).month
elif i=='weekday':
t_df[f_nm+'_weekday']=pd.DatetimeIndex(t_df[f_nm]).weekday_name
elif i=='week' in d_list:
t_df[f_nm+'_week']=pd.DatetimeIndex(t_df[f_nm]).week
elif i=='hour':
t_df[f_nm+'_hour']=pd.DatetimeIndex(t_df[f_nm]).hour
elif i=='minute':
t_df[f_nm+'_minute']=pd.DatetimeIndex(t_df[f_nm]).minute
return t_df
# Create distribution plots of continuous variables by time periods in data
def ts_plotter(ts_pdf, ts_plist, cont_list, plt_typ, tser):
ts_cat=[tser+'_'+i for i in ts_plist]
op_df=ts_periods(f_nm=tser, d_list=ts_plist, d_df=ts_pdf)
plotter(ca_col=ts_cat,co_col=cont_list,d_df=op_df,p_typ=plt_typ)
ts_plotter(ts_pdf=data_df, ts_plist=['year','minute'], cont_list=['duration'], plt_typ='boxplot', tser='updated_time')
错误:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-14-ee29a16ef8de> in <module>()
----> 1 ts_plotter(ts_pdf=data_df, ts_plist=['year','minute'], cont_list=['duration'], plt_typ='boxplot', tser='updated_time')
<ipython-input-12-c7fab8e851b9> in ts_plotter(ts_pdf, ts_plist, cont_list, plt_typ, tser)
5 op_df=ts_periods(f_nm=tser, d_list=ts_plist, d_df=ts_pdf)
6
----> 7 plotter(ca_col=ts_cat,co_col=cont_list,d_df=op_df,p_typ=plt_typ)
<ipython-input-7-f618189e8627> in plotter(ca_col, co_col, d_df, p_typ)
15 for i in cont_col:
16 if p_typ=='boxplot':
---> 17 data_df.boxplot(column=i,by=j)
18 elif p_typ=='hist':
19 data_df.hist(column=i,by=j)
~/anaconda2/envs/py36/lib/python3.6/site-packages/pandas/plotting/_core.py in boxplot_frame(self, column, by, ax, fontsize, rot, grid, figsize, layout, return_type, **kwds)
2255 ax = boxplot(self, column=column, by=by, ax=ax, fontsize=fontsize,
2256 grid=grid, rot=rot, figsize=figsize, layout=layout,
-> 2257 return_type=return_type, **kwds)
2258 plt.draw_if_interactive()
2259 return ax
~/anaconda2/envs/py36/lib/python3.6/site-packages/pandas/plotting/_core.py in boxplot(data, column, by, ax, fontsize, rot, grid, figsize, layout, return_type, **kwds)
2224 by=by, grid=grid, figsize=figsize,
2225 ax=ax, layout=layout,
-> 2226 return_type=return_type)
2227 else:
2228 if return_type is None:
~/anaconda2/envs/py36/lib/python3.6/site-packages/pandas/plotting/_core.py in _grouped_plot_by_column(plotf, data, columns, by, numeric_only, grid, figsize, ax, layout, return_type, **kwargs)
2664 figsize=None, ax=None, layout=None,
2665 return_type=None, **kwargs):
-> 2666 grouped = data.groupby(by)
2667 if columns is None:
2668 if not isinstance(by, (list, tuple)):
~/anaconda2/envs/py36/lib/python3.6/site-packages/pandas/core/generic.py in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed, **kwargs)
6663 return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
6664 sort=sort, group_keys=group_keys, squeeze=squeeze,
-> 6665 observed=observed, **kwargs)
6666
6667 def asfreq(self, freq, method=None, how=None, normalize=False,
~/anaconda2/envs/py36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in groupby(obj, by, **kwds)
2150 raise TypeError('invalid type: %s' % type(obj))
2151
-> 2152 return klass(obj, by, **kwds)
2153
2154
~/anaconda2/envs/py36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, **kwargs)
597 sort=sort,
598 observed=observed,
--> 599 mutated=self.mutated)
600
601 self.obj = obj
~/anaconda2/envs/py36/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _get_grouper(obj, key, axis, level, sort, observed, mutated, validate)
3289 in_axis, name, level, gpr = False, None, gpr, None
3290 else:
-> 3291 raise KeyError(gpr)
3292 elif isinstance(gpr, Grouper) and gpr.key is not None:
3293 # Add key to exclusions
KeyError: 'updated_time_year'