我需要与以下代码占用的数据帧相关时间相关的帮助。 完成大约2000条记录的数据集需要大约20秒。
def findRe(leaddatadf, keyAttributes, datadf):
for combs in itertools.combinations(atrList,
len(atrList)-1):
v_by =(set(atrList) - set(combs)) # varrying
grpdatapf=datadf.groupby(combs)
for name, group in grpdatapf:
if(group.shape[0]>1):
tmpgdf = leaddatadf[leaddatadf['unique_id'].astype(float).\
isin(group['unique_id'].astype(float))]
if(tmpgdf.shape[0]>1):
tmpgdf['mprice']=tmpgdf['mprice'].astype(float)
tmpgdf=tmpgdf.sort('mprice')
tmpgdf['id'] = tmpgdf['id']
tmpgdf['desc'] = tmpgdf['description']
tmpgdf['related_id'] = tmpgdf['id'].shift(-1)
tmpgdf['related_desc'] = tmpgdf['description'].shift(-1)
tmpgdf['related_mprice'] = tmpgdf['mprice'].shift(-1)
tmpgdf['pld'] = np.where(
(tmpgdf['related_price'].astype(float) > \
tmpgdf['mprice'].astype(float)),
(tmpgdf['related_price'].astype(float) - \
tmpgdf['mprice'].astype(float)) ,
(tmpgdf['mprice'].astype(float) - \
tmpgdf['related_mprice'].astype(float)))
tmpgdf['pltxt'] = np.where(
tmpgdf['related_mprice'].astype(float) - \
tmpgdf['mprice'].astype(float)>0.0,'<',
np.where(tmpgdf['related_mprice'].astype(float)\
- tmpgdf['mprice'].astype(float)<0,'>','='))
tmpgdf['prc_rlt_dif_nbr_p'] = abs(
(tmpgdf['pld'].astype(float) / \
((tmpgdf['mprice'].astype(float)))) )
tmpgdf['keyatr'] = str(atrList)
tmpgdf['varying'] = np.where(1==1,
"".join(v_by ),'')# varrying
temp = tmpgdf[['id',
'desc', 'related_id',
'related_desc', 'pltxt', 'pld',
'prc_rlt_dif_nbr_p', 'mprice', 'related_mprice',
'keyatr', 'varying']]
temp = temp[temp['related_mprice'].astype(float)>=0.0]
reldf.extend(list(temp.T.to_dict().values()))
return pd.DataFrame(
reldf, columns = ['id',
'desc', 'related_id',
'related_desc', 'pltxt', 'pld',
'prc_rlt_dif_nbr_p', 'mprice', 'related_mprice',
'keyatr', 'varying'])
答案 0 :(得分:0)
答案 1 :(得分:0)
您经常使用astype(float)
。每次使用它 - 都会创建该系列的副本。当您尝试加载数据框时,您可以尝试设置dtype=float
- 这样您只需将系列转换为浮动一次 - 而不是每次迭代:)
如果有帮助,请告诉我