处理1d数据帧/系列的熊猫

时间:2015-07-23 02:44:09

标签: python pandas

我希望比较客户购买的日期和之前的日期。我已遍历客户列表并成功执行了计算。当数据框中的客户只有一个条目时,就会出现问题。 我得到的1-D对象不像其他数据帧那样。我最初使用groupby来提供数据帧,但考虑到我必须“触摸”每一行并且该方法的性能非常慢,我开始采用下面的方法。

如何转换或处理这些1-d对象?

感谢任何帮助/想法/建议......谢谢

import pandas as pd
#import numpy as np

#dataframe data note: no_days_since_last_purchase hard coded for testing purposes
my_data = {'customer_id' : ['101A', '101A', '102A', '104B', '101A', '104B', '102A', '104B', '101A', '102A','101F' ],
          'date' : ['20120321','20120201','20120123','20111218','20111209','20111127','20111118','20111012','20111001','20110921','20110908'],
          'invoice_amt' : [654.76, 234.45, 99.45, 767.63, 124.76, 346.87, 652.65, 765.21, 275.76, 532.21,87.98 ],
          'no_days_since_last_purchase' : ['49', '54', '66', '21', '69', '46', '58', 'NaN', 'NaN', 'NaN','NaN']}

data_df = pd.DataFrame(my_data).sort_index(by='date',ascending=True)

#convert date str to date type
data_df['date'] = pd.to_datetime(data_df['date'].astype(str),format='%Y%m%d')

#Set index to customer and sort
data_df_by_customer = data_df.set_index('customer_id').sort_index()

#Get list of unique customers for use in the for loop
unique = pd.unique(data_df_by_customer.index.values.ravel())

df_container = []
#for each row in each dataframe calculate the difference in days between current and previous
#if there is no previous then use 2000-01-01 then convert to integer

unique = pd.unique(data_df_by_customer.index.values.ravel())

def add_days_since(frame,cust):
    l = len(frame.loc[cust].index)
    if isinstance(frame, pd.DataFrame):
        print l 
        print frame.loc[cust]
        pass
        #c = frame.loc[cust]
        #c['days_since'] = 0
        #df_container.append(c)

    else:
        #for each row in each dataframe calculate the difference in days between current and previous
        #if there is no previous then use 2000-01-01 then convert to integer
        c = frame.loc[cust].sort('date', ascending=True)
        c['days_since'] = (c['date'] - c['date'].shift().fillna(pd.datetime(2000,1,1))).astype('timedelta64[D]')
        df_container.append(c)
        print l

    return 


for customer in unique:
    add_days_since(data_df_by_customer,customer)

输出:

4
                  date  invoice_amt no_days_since_last_purchase
customer_id                                                    
101A        2012-02-01       234.45                          54
101A        2011-10-01       275.76                         NaN
101A        2011-12-09       124.76                          69
101A        2012-03-21       654.76                          49
3
date                           2011-09-08 00:00:00
invoice_amt                                  87.98
no_days_since_last_purchase                    NaN
Name: 101F, dtype: object
3
                  date  invoice_amt no_days_since_last_purchase
customer_id                                                    
102A        2011-09-21       532.21                         NaN
102A        2012-01-23        99.45                          66
102A        2011-11-18       652.65                          58
3
                  date  invoice_amt no_days_since_last_purchase
customer_id                                                    
104B        2011-12-18       767.63                          21
104B        2011-11-27       346.87                          46
104B        2011-10-12       765.21                         NaN
Wall time: 16 ms

1 个答案:

答案 0 :(得分:1)

import pandas as pd
import numpy as np

# your raw data without no_days_since_last_purchase
# ===========================================================
my_data = {'customer_id' : ['101A', '101A', '102A', '104B', '101A', '104B', '102A', '104B', '101A', '102A','101F' ],
          'date' : ['20120321','20120201','20120123','20111218','20111209','20111127','20111118','20111012','20111001','20110921','20110908'],
          'invoice_amt' : [654.76, 234.45, 99.45, 767.63, 124.76, 346.87, 652.65, 765.21, 275.76, 532.21,87.98 ]}

df = pd.DataFrame(my_data)
df['date'] = pd.to_datetime(df['date'])

df = df.sort('date')

   customer_id       date  invoice_amt
10        101F 2011-09-08        87.98
9         102A 2011-09-21       532.21
8         101A 2011-10-01       275.76
7         104B 2011-10-12       765.21
6         102A 2011-11-18       652.65
5         104B 2011-11-27       346.87
4         101A 2011-12-09       124.76
3         104B 2011-12-18       767.63
2         102A 2012-01-23        99.45
1         101A 2012-02-01       234.45
0         101A 2012-03-21       654.76

# processing
# ===================================
def func(group):
    group['no_days_since_last_purchase'] = np.diff(np.insert(group['date'].values, 0, np.datetime64('2001-01-01')))/np.timedelta64(1, 'D')
    return group

df.groupby('customer_id').apply(func)

   customer_id       date  invoice_amt  no_days_since_last_purchase
10        101F 2011-09-08        87.98                         3902
9         102A 2011-09-21       532.21                         3915
8         101A 2011-10-01       275.76                         3925
7         104B 2011-10-12       765.21                         3936
6         102A 2011-11-18       652.65                           58
5         104B 2011-11-27       346.87                           46
4         101A 2011-12-09       124.76                           69
3         104B 2011-12-18       767.63                           21
2         102A 2012-01-23        99.45                           66
1         101A 2012-02-01       234.45                           54
0         101A 2012-03-21       654.76                           49