# Sample data: one row per (campaign, stage) with the date recorded for
# that stage.
df = pd.DataFrame({
    'Campaign ID': [48464, 48464, 48464, 48464, 26380, 26380, 22676, 39529, 39529,
                    46029, 46029, 46029, 17030, 46724, 46724, 39379, 39379, 39379],
    'Campaign stage': ["Lost", "Developing", "Discussing", "Starting", "Discussing",
                       "Starting", "Developing", "Discussing", "Starting", "Developing",
                       "Discussing", "Starting", "Developing", "Developing", "Discussing",
                       "Lost", "Developing", "Discussing"],
    'Stage Number': [-1, 3, 2, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 3, 2, -1, 3, 2],
    'Campaign Date': ["2/8/2019", "1/9/2019", "1/3/2019", "3/3/2018", "2/14/2019",
                      "12/5/2018", "7/25/2018", "6/8/2018", "3/4/2018", "12/8/2018",
                      "9/9/2018", "5/31/2018", "6/7/2018", "3/27/2018", "1/6/2018",
                      "2/15/2019", "12/15/2018", "9/4/2018"],
})

# Parse the M/D/YYYY strings into real dates. Left as text they sort
# lexicographically, e.g. "3/3/2018" would be ordered after "2/8/2019".
df['Campaign Date'] = pd.to_datetime(df['Campaign Date'], format='%m/%d/%Y')

pvt = pd.pivot_table(
    df,
    values=['Campaign stage'],
    index=['Campaign ID', 'Campaign stage', 'Stage Number', 'Campaign Date'],
    aggfunc='count',
)

# sort_values returns a new frame rather than sorting in place, so the
# result has to be kept: campaigns ascending, newest date first within each.
pvt = pvt.sort_values(['Campaign ID', 'Campaign Date'], ascending=[True, False])
大家好,我有上面的数据框。我想计算每个广告系列从 "Starting"(开始)阶段到 "Discussing"(讨论)阶段之间的天数,然后计算这些天数的平均值。
由于数据质量问题,各广告系列的阶段记录并不完整。因此,如果某个广告系列没有同时具备 "Starting" 和 "Discussing" 这两个阶段,我想将其天数设置为 0。
我创建了数据的数据透视表视图,并对广告系列日期的降序进行了排序...但是我不知道下一步该怎么做。
预先感谢您的帮助。
答案 0 :(得分:0)
这是我的建议,从您的df开始:
# Parse the date strings so that differences are computed on real dates.
df['Campaign Date'] = pd.to_datetime(df['Campaign Date'])

# Only the two stages of interest take part in the calculation. Campaigns
# that have neither stage drop out of the result entirely at this point.
df = df[df['Campaign stage'].isin(['Starting', 'Discussing'])]

pvt = pd.pivot_table(
    df,
    values=['Campaign stage'],
    index=['Campaign ID', 'Campaign stage', 'Stage Number', 'Campaign Date'],
    aggfunc='count',
)

# Order rows chronologically inside each campaign, then move the date
# (index level 3) out of the index into a column so it can be differenced.
pvt = pvt.sort_values(['Campaign ID', 'Campaign Date'], ascending=True).reset_index(level=3)

# Gap between consecutive dates within each campaign, totalled per campaign.
# The `level` argument of DataFrame.sum() was removed in pandas 2.0, so the
# per-campaign total is taken by grouping on the first index level instead.
pvt.groupby(level=0).diff().groupby(level=0).sum()
输出为:
Campaign ID - Campaign Date
26380 71 days
39379 0 days
39529 96 days
46029 101 days
46724 0 days
48464 306 days
如果您要查找的是总均值:
# Mean of the per-campaign totals. DataFrame.sum(level=0) no longer exists in
# pandas 2.0, so the per-campaign sum is done with groupby(level=0).
pvt.groupby(level=0).diff().groupby(level=0).sum().mean()
结果为:
95 days 16:00:00
答案 1 :(得分:0)
我的解决方案消除了对表格视图的需求。
# Parse the date strings so that subtracting two of them yields a Timedelta.
df['Campaign Date'] = pd.to_datetime(df['Campaign Date'])

days = []
for ID in df['Campaign ID'].unique():
    campaign = df[df['Campaign ID'] == ID]
    try:
        # .iloc[0] raises IndexError when the campaign has no row for the stage.
        a = campaign[campaign['Campaign stage'] == "Starting"].iloc[0]["Campaign Date"]
        b = campaign[campaign['Campaign stage'] == "Discussing"].iloc[0]["Campaign Date"]
    except IndexError:
        # One of the two stages is missing: count the campaign as 0 days.
        # Only IndexError is caught so that unrelated failures (a wrong
        # column name, unparsed dates, ...) are not silently turned into 0.
        days.append(0)
    else:
        days.append((b - a).days)

# Guard the division so an empty frame gives 0 instead of ZeroDivisionError.
average = sum(days) / len(days) if days else 0
答案 2 :(得分:0)
# Parse the M/D/YYYY strings so that subtracting two dates yields a Timedelta.
df['Campaign Date'] = pd.to_datetime(df['Campaign Date'], format='%m/%d/%Y')

# Maps each campaign id to the time elapsed from 'Starting' to 'Discussing'.
compare = {}
for ids, gp in df.groupby('Campaign ID'):
    try:
        # .iloc[0] raises IndexError when the group has no row for the stage.
        discussing = gp.loc[gp['Campaign stage'] == 'Discussing', 'Campaign Date'].iloc[0]
        starting = gp.loc[gp['Campaign stage'] == 'Starting', 'Campaign Date'].iloc[0]
    except IndexError:
        # A stage is missing. Store a zero Timedelta rather than the int 0 so
        # every value in the mapping (and in 'new_col') has the same type.
        compare[ids] = pd.Timedelta(0)
    else:
        compare[ids] = discussing - starting

# Broadcast the per-campaign result back onto every row of that campaign.
df['new_col'] = df['Campaign ID'].map(compare)
答案 3 :(得分:0)
我不确定您希望最终结果采用哪种形式,下面的示例假定您希望在新的数据框中使用它。
实现方式有很多种,这里我选择创建一个函数,并且为了清楚起见,把每个步骤分解说明如下。
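步骤:
1. 将日期列转换为 `datetime` 类型以便进行比较
2. 计算 `starting` 和 `discussing` 之间的天数
3. 取出所有不重复的 `cid`,并遍历每个 `cid`
4. 检查该 `cid` 是否同时具有我们感兴趣的两个阶段
5. 如果没有,则为该 `cid` 分配 0
6. 否则,计算 `discussing` 和 `starting` 之间的差异
7. 使用 `df.loc` 创建并计算我们的平均天数索引

代码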
# The dates start out as strings; convert them so they can be subtracted.
df['Campaign Date'] = df['Campaign Date'].pipe(pd.to_datetime)

# One row per campaign with its day count (getDays is defined below).
df_days = getDays(df)

# Assigning to a row label that does not exist yet appends a new row named
# 'Average'; only its 'days' cell is filled, with the mean of the column.
df_days.loc['Average', 'days'] = df_days['days'].mean()
def getDays(df):
    """Return the days from the 'starting' to the 'discussing' stage per campaign.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Campaign ID', 'Campaign stage' and
        'Campaign Date'. Stage names are matched case-insensitively.
        'Campaign Date' may hold datetimes or date strings that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        Columns 'cids' and 'days', one row per distinct campaign id in
        order of first appearance. 'days' is the whole-day difference
        (discussing date minus starting date), or 0 when the campaign
        does not have both stages.
    """
    # Normalise once instead of re-lowercasing the column for every campaign.
    stage = df['Campaign stage'].str.lower()
    dates = pd.to_datetime(df['Campaign Date'])

    days = []
    cids = []
    for cid in df['Campaign ID'].unique():
        in_campaign = df['Campaign ID'] == cid
        starting = dates[in_campaign & (stage == 'starting')]
        discussing = dates[in_campaign & (stage == 'discussing')]

        # Test each stage directly. Counting distinct raw stage names would
        # treat 'Starting' and 'starting' as two stages and then fail on the
        # missing 'discussing' row.
        if starting.empty or discussing.empty:
            d = 0
        else:
            # Use the first row of each stage so that repeated stage rows
            # cannot produce arrays of mismatched length. Comparing calendar
            # dates ignores any time-of-day component.
            d = (discussing.iloc[0].date() - starting.iloc[0].date()).days

        cids.append(cid)
        days.append(d)

    return pd.DataFrame({
        'cids': cids,
        'days': days,
    })
输出
cids days
0 48464.0 306.00
1 26380.0 71.00
2 22676.0 0.00
3 39529.0 96.00
4 46029.0 101.00
5 17030.0 0.00
6 46724.0 0.00
7 39379.0 0.00
Average NaN 71.75