我的数据集如下所示:https://www.dropbox.com/s/u4brzjnhac0pwnj/TEST.xlsx?dl=0
我需要将原始表中的数据转换为附加文件中所需表格中的数据。
我有一组家庭(HH),每个家庭有 1 到 7 个月的数据。对于每个家庭的每一条月份记录,我希望把它之前 3 个月的对应数据放在该记录旁边,并位于同一行。每个家庭的所有月份都要这样处理。
这个问题用文字解释起来比较复杂,我想直接看一下数据会更容易理解。
我为此写了一些代码,但它非常不灵活,而且要逐条遍历数据集的全部 500 万(5mn)条记录,需要运行好几天。这应该可以用更高效的方式完成。
import pandas as pd
import os
# Switch to the working directory so the relative CSV path below resolves there.
os.chdir(r'H:\shared\tran')
c = pd.read_csv(r'0.csv')

# Append one placeholder column per (lag prefix, field) pair, each filled with
# the empty string. Columns are created prefix by prefix, with the fields in
# the order listed, so the resulting column order is prev1, prev2, prev3.
for _prefix in ('_prev1_', '_prev2_', '_prev3_'):
    for _field in (
        'month_id',
        'tuned_duration',
        'weekend_tuned_duration',
        'channel_flips',
        'most_common_daypart',
        'programs_watched_per_hh',
        'midnight',
        'morning',
        'afternoon',
        'evening',
    ):
        c[_prefix + _field] = ''
# Source columns copied from each earlier month; the destination column for a
# given lag N is '_prev<N>_<field>'.
_TRAN_FIELDS = (
    'month_id',
    'tuned_duration',
    'weekend_tuned_duration',
    'channel_flips',
    'most_common_daypart',
    'programs_watched_per_hh',
    'midnight',
    'morning',
    'afternoon',
    'evening',
)

# How many months back to look (1 = previous month, ...).
_TRAN_LAGS = (1, 2, 3)


def tran(v):
    """Copy the three preceding months of one household into the global ``c``.

    Intended to be used as ``c.groupby('household_id').apply(tran)``: ``v`` is
    the sub-frame of a single household. For every month in ``v`` that is
    greater than 3, the values of months ``month - 1``, ``month - 2`` and
    ``month - 3`` (when such a row exists in ``v``) are written into the
    ``_prev1_*``, ``_prev2_*`` and ``_prev3_*`` columns of the module-level
    frame ``c``, on the row whose index label is the first row of that month.

    Args:
        v: DataFrame holding the rows of one household. It must contain
            ``month_id`` and every column named in ``_TRAN_FIELDS``, and its
            index labels must be labels of ``c``.

    Returns:
        None. The result is the in-place modification of ``c``.

    Raises:
        KeyError: if ``v`` lacks one of the required columns.
    """
    # Position (within v) of the first row seen for each month. When a month
    # occurs more than once, only its first row is used, both as the row that
    # receives values and as the row values are read from.
    first_pos = {}
    for pos, month in enumerate(v['month_id'].tolist()):
        first_pos.setdefault(month, pos)

    labels = v.index
    # Pull each source column out once instead of re-filtering v per cell.
    field_values = {field: v[field].to_numpy() for field in _TRAN_FIELDS}

    for month, pos in first_pos.items():
        # Only months above 3 are filled (a NaN month also fails this test).
        if not month > 3:
            continue
        target = labels[pos]
        for lag in _TRAN_LAGS:
            src = first_pos.get(month - lag)
            if src is None:
                # No row for that earlier month: leave the placeholders as is.
                continue
            for field in _TRAN_FIELDS:
                c.loc[target, '_prev%d_%s' % (lag, field)] = field_values[field][src]
# NOTE(review): ``z`` is not defined in this part of the file; the loop below
# expects it to be a DataFrame with household_id, month_id, the source columns
# listed in _prev_fields and the '_prev<N>_<field>' destination columns.
z.head()
m=0

# Source columns copied from each earlier month; the destination column for a
# given lag N is '_prev<N>_<field>'.
_prev_fields = (
    'month_id',
    'tuned_duration',
    'weekend_tuned_duration',
    'channel_flips',
    'most_common_daypart',
    'programs_watched_per_hh',
    'midnight',
    'morning',
    'afternoon',
    'evening',
)

for k in z.household_id.unique():
    # Select the household's rows once. Only '_prev*' columns are written
    # below, so the source columns read from this selection stay valid.
    hh_rows = z[z['household_id']==k]
    for i in list(hh_rows.month_id):
        if i >3:
            # Index label of the row that receives the earlier-month values.
            ind = hh_rows[hh_rows.month_id==i].index[0]
            print ('Doing : hh:',k,' m:',i,' ind:',ind)
            for _lag in (1, 2, 3):
                j = i - _lag
                prev_rows = hh_rows[hh_rows.month_id==j]
                if prev_rows.empty:
                    # The household has no row for month j: leave that lag's
                    # columns untouched and continue with the next lag.
                    print ('No record found for HH ',k,' and month ',j)
                    continue
                for _field in _prev_fields:
                    # Single .loc write on z itself (no chained indexing), so
                    # the assignment cannot land on a temporary copy.
                    z.loc[ind, '_prev%d_%s' % (_lag, _field)] = prev_rows[_field].values[0]
        else:
            # Months 1-3 are skipped; report the month that was skipped.
            print ('Ignored for HH ',k,' and month ',i)
调用该函数:
#c.groupby('household_id').apply(tran)
答案 0 :(得分:0)
虽然我没能在 Python 中让它运行得更快,但我用 SQL 的分析函数(analytic functions)很快就完成了。
-- For every (household_id, month_id) row, append the values of up to three
-- other rows of the same household as extra columns (suffixes _1, _2, _3).
-- Each window is partitioned by household_id and ordered by month_id DESC, so
-- lead(x, n) reads x from the n-th following row in that ordering, i.e. the
-- row with the n-th lower month_id for that household. It yields NULL when
-- the household has no such row.
-- NOTE(review): the offset counts rows, not calendar months, so a household
-- with a missing month gets the next older available row rather than NULL.
-- NOTE(review): the program_watched_per_hh_N aliases drop the "s" of the
-- source column programs_watched_per_hh -- confirm that this is intended.
-- NOTE(review): "table" looks like a placeholder for the real source table.
select month_id, household_id, tuned_duration,weekend_tuned_duration,channel_flips,most_common_daypart,
programs_watched_per_hh,trend,midnight,morning,afternoon,evening,
-- Offset 1 (lead's default offset): the row just after this one in the window.
lead (month_id) over (partition by household_id order by month_id desc) as month_id_1,
lead (household_id) over (partition by household_id order by month_id desc) as household_id_1,
lead (tuned_duration) over (partition by household_id order by month_id desc) as tuned_duration_1,
lead (weekend_tuned_duration) over (partition by household_id order by month_id desc) as weekend_tuned_duration_1,
lead (channel_flips) over (partition by household_id order by month_id desc) as channel_flips_1,
lead (most_common_daypart) over (partition by household_id order by month_id desc) as most_common_daypart_1,
lead (programs_watched_per_hh) over (partition by household_id order by month_id desc) as program_watched_per_hh_1,
lead (trend) over (partition by household_id order by month_id desc) as trend_1,
lead (midnight) over (partition by household_id order by month_id desc) as midnight_1,
lead (morning) over (partition by household_id order by month_id desc) as morning_1,
lead (afternoon) over (partition by household_id order by month_id desc) as afternoon_1,
lead (evening) over (partition by household_id order by month_id desc) as evening_1,
-- Offset 2: two rows after this one in the window.
lead (month_id,2) over (partition by household_id order by month_id desc) as month_id_2,
lead (household_id,2) over (partition by household_id order by month_id desc) as household_id_2,
lead (tuned_duration,2) over (partition by household_id order by month_id desc) as tuned_duration_2,
lead (weekend_tuned_duration,2) over (partition by household_id order by month_id desc) as weekend_tuned_duration_2,
lead (channel_flips,2) over (partition by household_id order by month_id desc) as channel_flips_2,
lead (most_common_daypart,2) over (partition by household_id order by month_id desc) as most_common_daypart_2,
lead (programs_watched_per_hh,2) over (partition by household_id order by month_id desc) as program_watched_per_hh_2,
lead (trend,2) over (partition by household_id order by month_id desc) as trend_2,
lead (midnight,2) over (partition by household_id order by month_id desc) as midnight_2,
lead (morning,2) over (partition by household_id order by month_id desc) as morning_2,
lead (afternoon,2) over (partition by household_id order by month_id desc) as afternoon_2,
lead (evening,2) over (partition by household_id order by month_id desc) as evening_2,
-- Offset 3: three rows after this one in the window.
lead (month_id,3) over (partition by household_id order by month_id desc) as month_id_3,
lead (household_id,3) over (partition by household_id order by month_id desc) as household_id_3,
lead (tuned_duration,3) over (partition by household_id order by month_id desc) as tuned_duration_3,
lead (weekend_tuned_duration,3) over (partition by household_id order by month_id desc) as weekend_tuned_duration_3,
lead (channel_flips,3) over (partition by household_id order by month_id desc) as channel_flips_3,
lead (most_common_daypart,3) over (partition by household_id order by month_id desc) as most_common_daypart_3,
lead (programs_watched_per_hh,3) over (partition by household_id order by month_id desc) as program_watched_per_hh_3,
lead (trend,3) over (partition by household_id order by month_id desc) as trend_3,
lead (midnight,3) over (partition by household_id order by month_id desc) as midnight_3,
lead (morning,3) over (partition by household_id order by month_id desc) as morning_3,
lead (afternoon,3) over (partition by household_id order by month_id desc) as afternoon_3,
lead (evening,3) over (partition by household_id order by month_id desc) as evening_3
from
table