Python中的复杂数据转换

时间:2016-07-13 18:02:38

标签: python pandas machine-learning

我的数据集如下所示:https://www.dropbox.com/s/u4brzjnhac0pwnj/TEST.xlsx?dl=0

我需要把原始表中的数据转换成附件中“所需表格”那样的格式。

我有一组家庭(HH)的数据,每个家庭包含 1 到 7 个月的记录。对于每个家庭的每个月份,我希望把该家庭前 3 个月对应的数据追加到这条记录旁边,放在同一行中。每个家庭的所有月份都要这样处理。

这个问题用文字很难解释清楚,我认为直接看一下数据会更容易理解。

我为此编写了一些代码,但它非常不灵活,而且要逐条遍历数据集中全部 500 万(5mn)条记录,运行需要好几天。这应该可以用更高效的方式完成。

    import pandas as pd
import os

# Work from the shared folder and load the raw table into `c`.
os.chdir(r'H:\shared\tran')

c = pd.read_csv(r'0.csv')

# Add one empty-string placeholder column per (lag, source column) pair,
# named `_prev<lag>_<column>`, in the same order as the source columns below.
for lag in (1, 2, 3):
    for field in (
        'month_id',
        'tuned_duration',
        'weekend_tuned_duration',
        'channel_flips',
        'most_common_daypart',
        'programs_watched_per_hh',
        'midnight',
        'morning',
        'afternoon',
        'evening',
    ):
        c['_prev%d_%s' % (lag, field)] = ''




# Source columns copied into the `_prev<lag>_<column>` slots of `c`.
_TRAN_FIELDS = (
    'month_id',
    'tuned_duration',
    'weekend_tuned_duration',
    'channel_flips',
    'most_common_daypart',
    'programs_watched_per_hh',
    'midnight',
    'morning',
    'afternoon',
    'evening',
)


def tran(v):
    """Copy the values of the three preceding months onto rows of ``c``.

    For every ``month_id`` value ``i > 3`` found in ``v`` and every lag
    ``n`` in 1..3, the first row of ``v`` whose ``month_id`` equals
    ``i - n`` is looked up. When it exists, each column in ``_TRAN_FIELDS``
    is written to ``c['_prev<n>_<column>']`` at the index label of the
    first row of ``v`` with ``month_id == i``. When it does not exist the
    ``_prev<n>_*`` cells of that row are left untouched.

    Rows with ``month_id <= 3`` are skipped entirely.

    Args:
        v: DataFrame with a ``month_id`` column and the columns listed in
            ``_TRAN_FIELDS``. Presumably the rows of one household, as
            produced by ``c.groupby('household_id').apply(tran)``; its
            index labels are used to address rows of ``c``.

    Returns:
        None. The result is written into the module-level DataFrame ``c``.
    """
    for i in v.month_id:
        if i <= 3:
            continue

        # Index label (shared with `c`) of the row being filled.
        ind = v[v.month_id == i].index[0]
        print('index :', ind)

        for lag in (1, 2, 3):
            prev = v[v.month_id == i - lag]
            if prev.empty:
                # No record for that month: leave the placeholders as they are.
                continue
            for field in _TRAN_FIELDS:
                # Read per column so each value keeps its own column's dtype.
                c.loc[ind, '_prev%d_%s' % (lag, field)] = prev[field].iloc[0]


# Fill the `_prev<lag>_<column>` columns of `z` in place: for each household
# and each of its months `i > 3`, copy the values recorded for months
# i-1, i-2 and i-3 of the same household onto the row of month `i`.
z.head()
m = 0  # not read in this block; kept in case code outside this view uses it

# Source columns copied into the `_prev<lag>_<column>` slots.
_PREV_FIELDS = (
    'month_id',
    'tuned_duration',
    'weekend_tuned_duration',
    'channel_flips',
    'most_common_daypart',
    'programs_watched_per_hh',
    'midnight',
    'morning',
    'afternoon',
    'evening',
)

for k in z.household_id.unique():
    # Filter once per household instead of once per copied cell. Only the
    # `_prev*` columns of `z` are written below, so this snapshot of the
    # source columns stays valid for the whole household.
    hh = z[z['household_id'] == k]

    for i in list(hh.month_id):
        if i <= 3:
            # Report the month actually skipped (`j` is not defined here).
            print('Ignored for HH ', k, ' and month ', i)
            continue

        # Index label of the row being filled.
        ind = hh[hh.month_id == i].index[0]
        print('Doing : hh:', k, '  m:', i, '  ind:', ind)

        for lag in (1, 2, 3):
            j = i - lag
            prev = hh[hh.month_id == j]
            if prev.empty:
                print('No record found for HH ', k, ' and month ', j)
                continue
            for field in _PREV_FIELDS:
                # A single .loc write avoids chained assignment, which is not
                # guaranteed to modify `z` itself.
                z.loc[ind, '_prev%d_%s' % (lag, field)] = prev[field].iloc[0]

调用该函数:

#c.groupby('household_id').apply(tran)

1 个答案:

答案 0 :(得分:0)

虽然我没能在 Python 中让它运行得更快,但我使用 SQL 的分析函数(窗口函数)很快就完成了。

    -- For each (household_id, month_id) row, append the values of the next
    -- 1, 2 and 3 rows of the same household in descending month order, i.e.
    -- that household's three preceding recorded months.
    -- NOTE: LEAD offsets count rows, not calendar months. If a household has a
    -- gap in its months, check month_id_1/2/3 to see which month each slot holds.
    SELECT
        month_id,
        household_id,
        tuned_duration,
        weekend_tuned_duration,
        channel_flips,
        most_common_daypart,
        programs_watched_per_hh,
        trend,
        midnight,
        morning,
        afternoon,
        evening,
        -- one row back within the household
        LEAD(month_id, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS month_id_1,
        LEAD(household_id, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS household_id_1,
        LEAD(tuned_duration, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS tuned_duration_1,
        LEAD(weekend_tuned_duration, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS weekend_tuned_duration_1,
        LEAD(channel_flips, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS channel_flips_1,
        LEAD(most_common_daypart, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS most_common_daypart_1,
        LEAD(programs_watched_per_hh, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS program_watched_per_hh_1,
        LEAD(trend, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS trend_1,
        LEAD(midnight, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS midnight_1,
        LEAD(morning, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS morning_1,
        LEAD(afternoon, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS afternoon_1,
        LEAD(evening, 1) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS evening_1,
        -- two rows back within the household
        LEAD(month_id, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS month_id_2,
        LEAD(household_id, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS household_id_2,
        LEAD(tuned_duration, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS tuned_duration_2,
        LEAD(weekend_tuned_duration, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS weekend_tuned_duration_2,
        LEAD(channel_flips, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS channel_flips_2,
        LEAD(most_common_daypart, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS most_common_daypart_2,
        LEAD(programs_watched_per_hh, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS program_watched_per_hh_2,
        LEAD(trend, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS trend_2,
        LEAD(midnight, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS midnight_2,
        LEAD(morning, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS morning_2,
        LEAD(afternoon, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS afternoon_2,
        LEAD(evening, 2) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS evening_2,
        -- three rows back within the household
        LEAD(month_id, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS month_id_3,
        LEAD(household_id, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS household_id_3,
        LEAD(tuned_duration, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS tuned_duration_3,
        LEAD(weekend_tuned_duration, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS weekend_tuned_duration_3,
        LEAD(channel_flips, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS channel_flips_3,
        LEAD(most_common_daypart, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS most_common_daypart_3,
        LEAD(programs_watched_per_hh, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS program_watched_per_hh_3,
        LEAD(trend, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS trend_3,
        LEAD(midnight, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS midnight_3,
        LEAD(morning, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS morning_3,
        LEAD(afternoon, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS afternoon_3,
        LEAD(evening, 3) OVER (PARTITION BY household_id ORDER BY month_id DESC) AS evening_3
    FROM
        table