我编写了一个函数：对每一行记录，统计在其创建时间之前的指定月份窗口内，与该行在给定分组列（供应商或商店）上取值相同、且状态符合条件（例如已售出）的记录数量。有没有更高效的方式来实现下面这个函数？
在大型数据集上,多次调用函数非常耗时。我在下面附上预期的结果。
我在示例中使用此数据:
# Sample orders used by the example below, written one order per row.
# All four columns hold plain strings; 'CreatedDate' is not parsed here.
df = pd.DataFrame(
    [
        ('2019-03-02 18:03:08.690000', 'resignation', 'A', 'a'),
        ('2019-02-01 21:22:43.109000', 'sold', 'B', 'a'),
        ('2019-03-09 21:21:44.337000', 'in delivery', 'C', 'b'),
        ('2019-04-04 21:45:59.988000', 'sold', 'A', 'a'),
        ('2019-03-15 18:53:17.683000', 'in delivery', 'A', 'a'),
        ('2019-03-03 00:24:28.003000', 'resignation', 'A', 'a'),
        ('2019-04-14 22:01:20.759000', 'sold', 'A', 'a'),
        ('2019-02-26 19:28:01.800000', 'sold', 'A', 'a'),
        ('2019-03-21 21:23:09.629000', 'sold', 'A', 'a'),
        ('2019-02-03 00:55:51.756000', 'sold', 'B', 'a'),
    ],
    columns=['CreatedDate', 'Status', 'Vendor', 'Shop'],
)
#fun
def add_count_values(df, month_start, month_stop, grouping_column, statuses, new_col):
    """Count, per row, earlier same-group rows inside a look-back window.

    For each row the window is the open interval
    ``(CreatedDate - month_stop months, CreatedDate - month_start months)``.
    A row is counted for another row when its ``CreatedDate`` lies strictly
    inside that window, it has the same value in ``grouping_column``, and
    its ``Status`` is one of ``statuses``.

    Args:
        df: Frame with ``CreatedDate``, ``Status`` and ``grouping_column``
            columns. It is not modified.
        month_start: Months to go back for the (exclusive) upper bound.
        month_stop: Months to go back for the (exclusive) lower bound.
        grouping_column: Column whose value must match (e.g. vendor or shop).
        statuses: List-like of ``Status`` values that are counted.
        new_col: Name of the integer count column added to the result.

    Returns:
        A copy of ``df`` with ``CreatedDate`` converted to datetime and the
        count column ``new_col`` added; the index of ``df`` is preserved.
    """
    import numpy as np

    # Work on a copy so the caller's frame keeps its original columns/dtypes.
    result = df.copy()
    result['CreatedDate'] = pd.to_datetime(result['CreatedDate'])
    created = result['CreatedDate']

    created_values = created.to_numpy()
    window_end = (created - pd.DateOffset(months=month_start)).to_numpy()
    window_begin = (created - pd.DateOffset(months=month_stop)).to_numpy()
    # Rows without a date can never fall inside a window, so exclude them.
    countable = (result['Status'].isin(statuses) & created.notna()).to_numpy()

    counts = np.zeros(len(result), dtype=np.int64)

    # Group row *positions* (not labels) so any index works; rows whose
    # group key is missing belong to no group and keep a count of 0.
    row_positions = pd.Series(np.arange(len(result)))
    group_keys = result[grouping_column].to_numpy()
    for _, members in row_positions.groupby(group_keys, sort=False):
        idx = members.to_numpy()
        sorted_dates = np.sort(created_values[idx][countable[idx]])
        # Number of countable dates strictly below the upper bound ...
        below_end = np.searchsorted(sorted_dates, window_end[idx], side='left')
        # ... minus those at or below the lower bound.
        up_to_begin = np.searchsorted(sorted_dates, window_begin[idx], side='right')
        # An empty/inverted window would give a negative difference.
        counts[idx] = np.maximum(below_end - up_to_begin, 0)

    result[new_col] = counts
    return result
# Example: per order, count same-vendor orders with status 'sold' or
# 'in delivery' created within the month before it.
add_count_values(
    df,
    month_start=0,
    month_stop=1,
    grouping_column='Vendor',
    statuses=['sold', 'in delivery'],
    new_col='vendor_last_30days_sold',
)