我正在使用pandas
为我们的客户生成自定义报告。生成报告的流程如下:
pandas
函数进行解析,执行一些业务逻辑,并通过email
发送生成的报告。业务逻辑:
我们必须在数据中找到所有人的first-in
和last-out
时间。
df = pd.DataFrame(report_data)
df["Code"] = df["[People]Employee Code"]
df["Checkin At Date"] = pd.to_datetime(df["Checkin At Date"])
df["Checkin At Time"] = pd.to_datetime(df["Checkin At Time"])
grouped_data = df.groupby(["Mobile Number", "Checkin At Date"])
final_df = pd.DataFrame(columns=df.columns)
for key, temp_df in grouped_data:
try:
num_of_rows = len(temp_df.index)
temp_df.sort_values(
by=["Checkin At Time"], inplace=True, ascending=True
)
first_checkin_time = temp_df.loc[
temp_df.index[0], "Checkin At Time"
]
last_checkin_time = temp_df.loc[
temp_df.index[-1], "Checkin At Time"
]
temp_df['FIRST_BODY_TEMP'] = temp_df.loc[
temp_df.index[0], "Body Temp"
]
temp_df['LAST_BODY_TEMP'] = temp_df.loc[
temp_df.index[-1], "Body Temp"
]
if num_of_rows <= 1:
temp_df.loc['TOTAL_TIME'] = 0
temp_df.loc['LAST_OUT_TIME'] = None
else:
temp_df['TOTAL_TIME'] = last_checkin_time - first_checkin_time
temp_df['LAST_OUT_TIME'] = temp_df.loc[
temp_df.index[-1], "Checkin At Time"]
temp_df["TOTAL_TIME"] = pd.to_timedelta(
temp_df["TOTAL_TIME"], unit="s"
)
temp_df["TOTAL_TIME"] = temp_df["TOTAL_TIME"].apply(
lambda x: strfdelta(x, "{hours} Hrs {minutes} Min")
)
temp_df["LAST_OUT_TIME"] = temp_df["LAST_OUT_TIME"].dt.time
# Fetching the first row of the grouped data and appending to df
final_df = final_df.append(temp_df.iloc[0], ignore_index=True)
except Exception as e:
logger.exception(e)
continue
对于记录容量800 items
,这几乎要用18-20 seconds
。
def calculate_attendance(temp_df):
num_of_rows = len(temp_df.index)
temp_df.sort_values(by=['Checkin At Time'], inplace=True,
ascending=True)
first_checkin_time = temp_df.loc[temp_df.index[0], 'Checkin At Time'
]
last_checkin_time = temp_df.loc[temp_df.index[-1], 'Checkin At Time'
]
temp_df['FIRST_BODY_TEMP'] = temp_df.loc[temp_df.index[0],
'Body Temp']
temp_df['LAST_BODY_TEMP'] = temp_df.loc[temp_df.index[-1],
'Body Temp']
if num_of_rows <= 1:
temp_df.loc['TOTAL_TIME'] = 0
temp_df.loc['LAST_OUT_TIME'] = None
else:
temp_df['TOTAL_TIME'] = last_checkin_time - first_checkin_time
temp_df['LAST_OUT_TIME'] = temp_df.loc[temp_df.index[-1],
'Checkin At Time']
temp_df['TOTAL_TIME'] = pd.to_timedelta(temp_df['TOTAL_TIME'],
unit='s')
temp_df['TOTAL_TIME'] = temp_df['TOTAL_TIME'].apply(lambda x: \
strfdelta(x, '{hours} Hrs {minutes} Min'))
temp_df['LAST_OUT_TIME'] = temp_df['LAST_OUT_TIME'].dt.time
return temp_df.iloc[0]
final_df = pd.DataFrame(columns=df.columns)
final_df = final_df.groupby(["Mobile Number", "Checkin At Date"]).apply(calculate_attendance)
同一组重复出现,final_df
位于同一行。