我希望比较客户购买的日期和之前的日期。我已遍历客户列表并成功执行了计算。当数据框中的客户只有一个条目时,就会出现问题。 我得到的1-D对象不像其他数据帧那样。我最初使用groupby来提供数据帧,但考虑到我必须“触摸”每一行并且该方法的性能非常慢,我开始采用下面的方法。
如何转换或处理这些1-d对象?
感谢任何帮助/想法/建议......谢谢
import pandas as pd
#import numpy as np
#dataframe data note: no_days_since_last_purchase hard coded for testing purposes
my_data = {'customer_id' : ['101A', '101A', '102A', '104B', '101A', '104B', '102A', '104B', '101A', '102A','101F' ],
'date' : ['20120321','20120201','20120123','20111218','20111209','20111127','20111118','20111012','20111001','20110921','20110908'],
'invoice_amt' : [654.76, 234.45, 99.45, 767.63, 124.76, 346.87, 652.65, 765.21, 275.76, 532.21,87.98 ],
'no_days_since_last_purchase' : ['49', '54', '66', '21', '69', '46', '58', 'NaN', 'NaN', 'NaN','NaN']}
data_df = pd.DataFrame(my_data).sort_index(by='date',ascending=True)
#convert date str to date type
data_df['date'] = pd.to_datetime(data_df['date'].astype(str),format='%Y%m%d')
#Set index to customer and sort
data_df_by_customer = data_df.set_index('customer_id').sort_index()
#Get list of unique customers for use in the for loop
unique = pd.unique(data_df_by_customer.index.values.ravel())
df_container = []
#for each row in each dataframe calculate the difference in days between current and previous
#if there is no previous then use 2000-01-01 then convert to integer
unique = pd.unique(data_df_by_customer.index.values.ravel())
def add_days_since(frame,cust):
l = len(frame.loc[cust].index)
if isinstance(frame, pd.DataFrame):
print l
print frame.loc[cust]
pass
#c = frame.loc[cust]
#c['days_since'] = 0
#df_container.append(c)
else:
#for each row in each dataframe calculate the difference in days between current and previous
#if there is no previous then use 2000-01-01 then convert to integer
c = frame.loc[cust].sort('date', ascending=True)
c['days_since'] = (c['date'] - c['date'].shift().fillna(pd.datetime(2000,1,1))).astype('timedelta64[D]')
df_container.append(c)
print l
return
for customer in unique:
add_days_since(data_df_by_customer,customer)
输出:
4
date invoice_amt no_days_since_last_purchase
customer_id
101A 2012-02-01 234.45 54
101A 2011-10-01 275.76 NaN
101A 2011-12-09 124.76 69
101A 2012-03-21 654.76 49
3
date 2011-09-08 00:00:00
invoice_amt 87.98
no_days_since_last_purchase NaN
Name: 101F, dtype: object
3
date invoice_amt no_days_since_last_purchase
customer_id
102A 2011-09-21 532.21 NaN
102A 2012-01-23 99.45 66
102A 2011-11-18 652.65 58
3
date invoice_amt no_days_since_last_purchase
customer_id
104B 2011-12-18 767.63 21
104B 2011-11-27 346.87 46
104B 2011-10-12 765.21 NaN
Wall time: 16 ms
答案 0 :(得分:1)
import pandas as pd
import numpy as np
# your raw data without no_days_since_last_purchase
# ===========================================================
my_data = {'customer_id' : ['101A', '101A', '102A', '104B', '101A', '104B', '102A', '104B', '101A', '102A','101F' ],
'date' : ['20120321','20120201','20120123','20111218','20111209','20111127','20111118','20111012','20111001','20110921','20110908'],
'invoice_amt' : [654.76, 234.45, 99.45, 767.63, 124.76, 346.87, 652.65, 765.21, 275.76, 532.21,87.98 ]}
df = pd.DataFrame(my_data)
df['date'] = pd.to_datetime(df['date'])
df = df.sort('date')
customer_id date invoice_amt
10 101F 2011-09-08 87.98
9 102A 2011-09-21 532.21
8 101A 2011-10-01 275.76
7 104B 2011-10-12 765.21
6 102A 2011-11-18 652.65
5 104B 2011-11-27 346.87
4 101A 2011-12-09 124.76
3 104B 2011-12-18 767.63
2 102A 2012-01-23 99.45
1 101A 2012-02-01 234.45
0 101A 2012-03-21 654.76
# processing
# ===================================
def func(group):
group['no_days_since_last_purchase'] = np.diff(np.insert(group['date'].values, 0, np.datetime64('2001-01-01')))/np.timedelta64(1, 'D')
return group
df.groupby('customer_id').apply(func)
customer_id date invoice_amt no_days_since_last_purchase
10 101F 2011-09-08 87.98 3902
9 102A 2011-09-21 532.21 3915
8 101A 2011-10-01 275.76 3925
7 104B 2011-10-12 765.21 3936
6 102A 2011-11-18 652.65 58
5 104B 2011-11-27 346.87 46
4 101A 2011-12-09 124.76 69
3 104B 2011-12-18 767.63 21
2 102A 2012-01-23 99.45 66
1 101A 2012-02-01 234.45 54
0 101A 2012-03-21 654.76 49