# Sample rows: an id, a date1/date2 pair (all in January 2016) and two scores.
data = {
    'id': [1, 1, 1, 1, 2, 2, 2, 2],
    'date1': [datetime.date(2016, 1, day) for day in (1, 2, 2, 4, 2, 4, 3, 1)],
    'date2': [datetime.date(2016, 1, day) for day in (5, 3, 5, 5, 4, 5, 4, 1)],
    'score1': [5, 7, 3, 2, 9, 3, 8, 3],
    'score2': [1, 3, 0, 5, 2, 20, 7, 7],
}
# Each dict key becomes one column of the frame.
df = pd.DataFrame(data)
数据帧df如下所示:
id date1 date2 score1 score2
0 1 2016-01-01 2016-01-05 5 1
1 1 2016-01-02 2016-01-03 7 3
2 1 2016-01-02 2016-01-05 3 0
3 1 2016-01-04 2016-01-05 2 5
4 2 2016-01-02 2016-01-04 9 2
5 2 2016-01-04 2016-01-05 3 20
6 2 2016-01-03 2016-01-04 8 7
7 2 2016-01-01 2016-01-01 3 7
另一个数据框UF包含id和usetdate两列:
id usetdate
0 1 2016-01-01
1 1 2016-01-03
2 2 2016-01-04
3 2 2016-01-02
如果我传入的日期(usetdate)落在date1和date2之间(含两端),我想针对UF中的每个id和usetdate组合,求出df中对应行的score1之和与score2之和。期望的结果如下:
id usetdate score1 score2
0 1 2016-01-01 5 1
1 1 2016-01-03 17 9
2 2 2016-01-04 20 29
3 2 2016-01-02 9 2
答案 0 :(得分:1)
您可以先用date_range为每一行生成其date1到date2之间的所有日期,并据此构造一个Series;然后交换该Series的索引和值,使其索引变为原始行索引,再用DataFrame.join连接回原数据框;最后用sum进行聚合:
# One Series per row of df: indexed by every day from date1 to date2, valued with that row's label.
s = pd.concat([
    pd.Series(r.Index, index=pd.date_range(r.date1, r.date2))
    for r in df.itertuples()
])
# Swap index and values: the row label becomes the index, the day becomes the value.
s = pd.Series(s.index, index=s, name='usetdate')
# Joining on the row label repeats each score row once per day it covers.
df = df.drop(['date1', 'date2'], axis=1)
df = df.join(s)
# Total the scores for every (id, day) pair.
df = df.groupby(['id', 'usetdate'], as_index=False).sum()
print(df)
id usetdate score1 score2
0 1 2016-01-01 5 1
1 1 2016-01-02 15 4
2 1 2016-01-03 15 4
3 1 2016-01-04 10 6
4 1 2016-01-05 10 6
5 2 2016-01-01 3 7
6 2 2016-01-02 9 2
7 2 2016-01-03 17 9
8 2 2016-01-04 20 29
9 2 2016-01-05 3 20
编辑:
# Expand each row into one (id, day, score1, score2) record per day between date1 and date2.
L = [
    (i, d, s1, s2)
    for i, d1, d2, s1, s2 in df.values
    for d in pd.date_range(d1, d2)
]
# Rebuild a frame from the per-day records, then total the scores per (id, day).
df = pd.DataFrame(L, columns=['id', 'usetdate', 'score1', 'score2'])
df = df.groupby(['id', 'usetdate'], as_index=False).sum()
print(df)
id usetdate score1 score2
0 1 2016-01-01 5 1
1 1 2016-01-02 15 4
2 1 2016-01-03 15 4
3 1 2016-01-04 10 6
4 1 2016-01-05 10 6
5 2 2016-01-01 3 7
6 2 2016-01-02 9 2
7 2 2016-01-03 17 9
8 2 2016-01-04 20 29
9 2 2016-01-05 3 20
编辑:
在聚合之前,可以先用merge(默认为内连接)与df1合并,只保留df1中出现的(id, userdate)组合:
# Parse the lookup column with pd.to_datetime; the expansion below builds its
# 'userdate' values with pd.date_range, so the merge keys need to be datetimes too.
df1['userdate'] = pd.to_datetime(df1['userdate'])
print (df1)
id userdate
0 1 2016-01-01
1 1 2016-01-03
2 2 2016-01-04
3 2 2016-01-02
# Expand each row into one (id, day, score1, score2) record per day between date1 and date2.
L = [
    (i, d, s1, s2)
    for i, d1, d2, s1, s2 in df.values
    for d in pd.date_range(d1, d2)
]
df = pd.DataFrame(L, columns=['id', 'userdate', 'score1', 'score2'])
# merge() with no arguments joins on the shared columns and keeps matching rows only,
# so just the (id, userdate) pairs present in df1 survive.
df = df.merge(df1)
# Total the scores for each remaining (id, userdate) pair.
df = df.groupby(['id', 'userdate'], as_index=False).sum()
print(df)
id userdate score1 score2
0 1 2016-01-01 5 1
1 1 2016-01-03 15 4
2 2 2016-01-02 9 2
3 2 2016-01-04 20 29
EDIT1:
也可以先把df1的每一行转换为元组,然后在列表推导式中按这些元组进行过滤:
# Parse the lookup column with pd.to_datetime so its values can be compared with
# the days that pd.date_range produces in the list comprehension further down.
df1['userdate'] = pd.to_datetime(df1['userdate'])
print (df1)
id userdate
0 1 2016-01-01
1 1 2016-01-03
2 2 2016-01-04
3 2 2016-01-02
# Turn every row of df1 into a plain tuple so it can be used in a membership test.
a = list(map(tuple, df1.values))
print(a)
[(1, Timestamp('2016-01-01 00:00:00')), (1, Timestamp('2016-01-03 00:00:00')),
(2, Timestamp('2016-01-04 00:00:00')), (2, Timestamp('2016-01-02 00:00:00'))]
# Hash the requested (id, date) pairs once: testing membership in a set is O(1),
# whereas `in a` would rescan the whole list for every generated day.
wanted = set(a)
# Expand each row into one record per day between date1 and date2, keeping
# only the days that were actually requested for that id.
L = [
    (i, d, s1, s2)
    for i, d1, d2, s1, s2 in df.values
    for d in pd.date_range(d1, d2)
    if (i, d) in wanted
]
# Total the scores for each requested (id, userdate) pair.
df = (pd.DataFrame(L, columns=['id', 'userdate', 'score1', 'score2'])
        .groupby(['id', 'userdate'], as_index=False)
        .sum())
print(df)
id userdate score1 score2
0 1 2016-01-01 5 1
1 1 2016-01-03 15 4
2 2 2016-01-02 9 2
3 2 2016-01-04 20 29
答案 1 :(得分:1)
import datetime
import pandas as pd
# Interval table: an id, a date1/date2 pair (January 2016) and two scores per row.
data = dict(
    id=[1, 1, 1, 1, 2, 2, 2, 2],
    date1=[datetime.date(2016, 1, n) for n in (1, 2, 2, 4, 2, 4, 3, 1)],
    date2=[datetime.date(2016, 1, n) for n in (5, 3, 5, 5, 4, 5, 4, 1)],
    score1=[5, 7, 3, 2, 9, 3, 8, 3],
    score2=[1, 3, 0, 5, 2, 20, 7, 7],
)
df = pd.DataFrame(data)

# Lookup table: the (id, date) pairs to query; `data` is deliberately rebound here.
data = dict(
    id=[1, 1, 2, 2],
    date=[datetime.date(2016, 1, n) for n in (1, 2, 2, 4)],
)
df1 = pd.DataFrame(data)

# Empty module-level placeholders; no module-level statement shown here reads them.
data1 = []
x = []
def agg(df, df1):
    """Total score1 and score2 for every (id, date) pair listed in df1.

    A row of ``df`` counts towards a pair when its ``id`` equals the pair's
    id and the pair's date lies in the closed interval [date1, date2].

    Parameters
    ----------
    df : pandas.DataFrame
        Must have the columns ``id``, ``date1``, ``date2``, ``score1`` and
        ``score2``.
    df1 : pandas.DataFrame
        Must have the columns ``id`` and ``date``; its dates have to be
        comparable with the values stored in ``date1``/``date2``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``date``, ``score1``, ``score2``; one row per row of
        ``df1``, in the same order. Pairs that match no interval get 0.
    """
    totals = []
    # Walk every lookup row, not just a single hard-coded position.
    for key, day in zip(df1['id'], df1['date']):
        # Build the whole mask against df itself so the boolean index is
        # always aligned with the frame it filters.
        inside = (df['id'] == key) & (df['date1'] <= day) & (df['date2'] >= day)
        hits = df.loc[inside, ['score1', 'score2']]
        totals.append((key, day, hits['score1'].sum(), hits['score2'].sum()))
    return pd.DataFrame(totals, columns=['id', 'date', 'score1', 'score2'])
# Run the aggregation on the two sample frames; the return value is not stored.
agg(df,df1)
请尝试这个