我有一个数据框(DataFrame),我想按某一列对其分组,并同时对每个分组应用多个聚合函数。不幸的是,这样做耗时太长,我需要把速度提高十倍。我了解过向量化,但那样做会让我无法使用许多 pandas 功能。
这是我的做法。首先定义一个函数,计算所需的全部统计量:
def f(x):
    """Summarise one group of payments as a Series of aggregate statistics.

    Written to be passed to ``DataFrameGroupBy.apply``; ``x`` is the
    sub-frame belonging to a single group.

    Args:
        x: DataFrame providing the columns ``scoring_dol_amount``,
            ``payment_status``, ``payment_status_detail`` and
            ``payment_date_created``. The last one is accessed through
            ``.dt``, so it must hold datetime-like values.

    Returns:
        pd.Series with 13 labelled entries, in this order:
        ``min_min_approved``, ``max_max_approved``, ``sum_approved``,
        ``avg_approved``, ``std_approved``, ``sum_approved_tpn``,
        ``sum_rejected_tpn``, ``sum_rejected_tpn_hr``, ``sum_rejected``,
        ``sum_rejected_hr``, ``avg_rejected``, ``std_rejected``,
        ``sum_late_hours``. The ``*_tpn`` entries and ``sum_late_hours``
        are counts of non-null amounts, not sums.
    """
    amount = x['scoring_dol_amount']
    hour = x['payment_date_created'].dt.hour

    # Build every filtered view exactly once per group; the original
    # re-evaluated the same boolean masks for each statistic.
    approved = amount[x['payment_status'] == 'approved']
    rejected = amount[x['payment_status'] == 'rejected']
    high_risk = amount[x['payment_status_detail'] == 'cc_rejected_high_risk']
    # "Late" means created at 23:00-23:59 or 00:00-06:59.
    late = amount[(hour >= 23) | (hour <= 6)]

    d = {
        'min_min_approved': approved.min(),
        'max_max_approved': approved.max(),
        'sum_approved': approved.sum(),
        'avg_approved': approved.mean(),
        'std_approved': approved.std(),
        'sum_approved_tpn': approved.count(),
        'sum_rejected_tpn': rejected.count(),
        'sum_rejected_tpn_hr': high_risk.count(),
        'sum_rejected': rejected.sum(),
        'sum_rejected_hr': high_risk.sum(),
        'avg_rejected': rejected.mean(),
        # Fixed: this previously used the 'approved' rows and therefore
        # just repeated std_approved.
        'std_rejected': rejected.std(),
        'sum_late_hours': late.count(),
    }

    # Metrics that were disabled in the original, kept for reference:
    # d['ratio_receive'] = approved.sum() / (rejected.sum() + approved.sum())
    # d['ratio_receive_tpn'] = approved.count() / (rejected.count() + approved.count())
    # d['distinct_tc'] = x['tc'].nunique()
    # d['distinct_doc'] = x['payer_identification_number'].nunique()
    # d['ratio_tc'] = x['tc'].nunique() / approved.count()
    # d['ratio_doc'] = x['payer_identification_number'].nunique() / approved.count()

    # Derive the index from the dict so labels cannot drift from the values.
    return pd.Series(d, index=list(d))
我正以这种方式应用它:
# Promote the customer id to the index so rows can be grouped by it.
dataset_recibido = dataset_recibido.set_index('cust_id')
# One group per distinct index value; f is evaluated once per group.
customer_groups = dataset_recibido.groupby(dataset_recibido.index)
customer_groups.apply(f)
我该如何加快速度?