假设我有下表
import pandas as pd, datetime
# Sample observations for Id 1: each row is [Date, Id].
table = [
    [datetime.datetime(2015, 1, 1), 1],
    [datetime.datetime(2015, 1, 27), 1],
    [datetime.datetime(2015, 1, 31), 1],
    [datetime.datetime(2015, 2, 1), 1],
    [datetime.datetime(2015, 2, 3), 1],
    [datetime.datetime(2015, 2, 15), 1],
    [datetime.datetime(2015, 2, 28), 1],
    [datetime.datetime(2015, 3, 1), 1],
    [datetime.datetime(2015, 3, 17), 1],
    [datetime.datetime(2015, 3, 28), 1],
    [datetime.datetime(2015, 4, 12), 1],
    [datetime.datetime(2015, 4, 28), 1],
]
df1 = pd.DataFrame(table, columns=['Date', 'Id'])

# The same dates again, this time for Id 2.
df2 = df1.copy()
df2['Id'] = 2

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat builds the same frame and, like append, keeps each part's own
# row labels (0..11 appears twice).
df = pd.concat([df1, df2])

# Validity intervals: each row is [Id, Start, End, Fix, Performance].
table2 = [
    [1, datetime.datetime(1900, 1, 1), datetime.datetime(2015, 2, 28), 2, 20],
    [1, datetime.datetime(2015, 3, 1), datetime.datetime(3000, 1, 1), 4, 25],
    [2, datetime.datetime(1900, 1, 1), datetime.datetime(3000, 1, 1), 2, 20],
]
df3 = pd.DataFrame(table2, columns=['Id', 'Start', 'End', 'Fix', 'Performance'])
各时间段对应的取值保存在 df3 中,该表按 Id 分组:前两行对 Id = 1 有效,最后一行对 Id = 2 有效。
我现在的问题是:有没有办法将 Fix 和 Performance 作为列添加到 df 中,使每一行取到的值来自 df3 中 Id 相同、且该行的 Date 落在 Start 与 End 之间的那条记录?也就是说,我希望得到的表格如下:
Date Id Fix Performance
0 2015-01-01 1 2 20
1 2015-01-27 1 2 20
2 2015-01-31 1 2 20
3 2015-02-01 1 2 20
4 2015-02-03 1 2 20
5 2015-02-15 1 2 20
6 2015-02-28 1 2 20
7 2015-03-01 1 4 25
8 2015-03-17 1 4 25
9 2015-03-28 1 4 25
10 2015-04-12 1 4 25
11 2015-04-28 1 4 25
0 2015-01-01 2 2 20
1 2015-01-27 2 2 20
2 2015-01-31 2 2 20
3 2015-02-01 2 2 20
4 2015-02-03 2 2 20
5 2015-02-15 2 2 20
6 2015-02-28 2 2 20
7 2015-03-01 2 2 20
8 2015-03-17 2 2 20
9 2015-03-28 2 2 20
10 2015-04-12 2 2 20
11 2015-04-28 2 2 20
谢谢,Tingis
答案 0 :(得分:1)
这是一种方法,您可以逐行apply
一个函数来生成两个想要的列:
import pandas as pd
import numpy as np
def search(x):
    """Look up the Fix and Performance values that apply to one row of df.

    Reads the module-level ``df3`` table and keeps the rows whose ``Id``
    equals ``x['Id']`` and whose ``Start``..``End`` range (both ends
    inclusive) contains ``x['Date']``.

    Args:
        x: A row of ``df`` providing at least ``'Id'`` and ``'Date'``.

    Returns:
        A two-element ``pd.Series`` holding ``Fix`` and ``Performance`` from
        the first matching row of ``df3``, or two NaN values when no interval
        covers the row.
    """
    same_id = df3[df3.Id == x['Id']]
    in_interval = (same_id.Start <= x['Date']) & (same_id.End >= x['Date'])
    # Filter once and reuse the result for both output values.
    hits = same_id.loc[in_interval]
    if hits.empty:
        # No interval covers this date: report missing values instead of
        # failing with IndexError on an empty list.
        return pd.Series([np.nan, np.nan])
    return pd.Series([hits['Fix'].iloc[0], hits['Performance'].iloc[0]])


# Apply the lookup row by row; the two returned values become the new columns.
df[['Fix', 'Performance']] = df.apply(search, axis=1)
In [423]: df
Out[423]:
Date Id Fix Performance
0 2015-01-01 1 2 20
1 2015-01-27 1 2 20
2 2015-01-31 1 2 20
3 2015-02-01 1 2 20
4 2015-02-03 1 2 20
5 2015-02-15 1 2 20
6 2015-02-28 1 2 20
7 2015-03-01 1 4 25
8 2015-03-17 1 4 25
9 2015-03-28 1 4 25
10 2015-04-12 1 4 25
11 2015-04-28 1 4 25
0 2015-01-01 2 2 20
1 2015-01-27 2 2 20
2 2015-01-31 2 2 20
3 2015-02-01 2 2 20
4 2015-02-03 2 2 20
5 2015-02-15 2 2 20
6 2015-02-28 2 2 20
7 2015-03-01 2 2 20
8 2015-03-17 2 2 20
9 2015-03-28 2 2 20
10 2015-04-12 2 2 20
11 2015-04-28 2 2 20
答案 1 :(得分:1)
您可以先执行SQL样式outer merge
,然后删除那些Date
落在Start-to-End
间隔之外的不一致记录。
import pandas as pd
import numpy as np
import datetime
# your data
# ========================================================
# Sample observations for Id 1: each row is [Date, Id].
table = [
    [datetime.datetime(2015, 1, 1), 1],
    [datetime.datetime(2015, 1, 27), 1],
    [datetime.datetime(2015, 1, 31), 1],
    [datetime.datetime(2015, 2, 1), 1],
    [datetime.datetime(2015, 2, 3), 1],
    [datetime.datetime(2015, 2, 15), 1],
    [datetime.datetime(2015, 2, 28), 1],
    [datetime.datetime(2015, 3, 1), 1],
    [datetime.datetime(2015, 3, 17), 1],
    [datetime.datetime(2015, 3, 28), 1],
    [datetime.datetime(2015, 4, 12), 1],
    [datetime.datetime(2015, 4, 28), 1],
]
df1 = pd.DataFrame(table, columns=['Date', 'Id'])

# The same dates again, this time for Id 2.
df2 = df1.copy()
df2['Id'] = 2

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat builds the same frame and keeps each part's own row labels.
df = pd.concat([df1, df2])
print(df)
Date Id
0 2015-01-01 1
1 2015-01-27 1
2 2015-01-31 1
3 2015-02-01 1
4 2015-02-03 1
5 2015-02-15 1
6 2015-02-28 1
7 2015-03-01 1
.. ... ..
4 2015-02-03 2
5 2015-02-15 2
6 2015-02-28 2
7 2015-03-01 2
8 2015-03-17 2
9 2015-03-28 2
10 2015-04-12 2
11 2015-04-28 2
# Validity intervals: each row is [Id, Start, End, Fix, Performance].
interval_columns = ['Id', 'Start', 'End', 'Fix', 'Performance']
table2 = [
    [1, datetime.datetime(1900, 1, 1), datetime.datetime(2015, 2, 28), 2, 20],
    [1, datetime.datetime(2015, 3, 1), datetime.datetime(2030, 1, 1), 4, 25],
    [2, datetime.datetime(1900, 1, 1), datetime.datetime(2030, 1, 1), 2, 20],
]
df3 = pd.DataFrame(data=table2, columns=interval_columns)
print(df3)
Id Start End Fix Performance
0 1 1900-01-01 2015-02-28 2 20
1 1 2015-03-01 2030-01-01 4 25
2 2 1900-01-01 2030-01-01 2 20
# processing
# =============================================
# Pair every df row with every df3 interval that shares its Id.
df_temp = df.merge(df3, how='outer', on='Id')
# Keep only the pairs whose Date lies within Start..End (both ends inclusive).
in_interval = (df_temp['Start'] <= df_temp['Date']) & (df_temp['End'] >= df_temp['Date'])
result = df_temp.loc[in_interval].reset_index(drop=True)
Date Id Start End Fix Performance
0 2015-01-01 1 1900-01-01 2015-02-28 2 20
1 2015-01-27 1 1900-01-01 2015-02-28 2 20
2 2015-01-31 1 1900-01-01 2015-02-28 2 20
3 2015-02-01 1 1900-01-01 2015-02-28 2 20
4 2015-02-03 1 1900-01-01 2015-02-28 2 20
5 2015-02-15 1 1900-01-01 2015-02-28 2 20
6 2015-02-28 1 1900-01-01 2015-02-28 2 20
7 2015-03-01 1 2015-03-01 2030-01-01 4 25
8 2015-03-17 1 2015-03-01 2030-01-01 4 25
9 2015-03-28 1 2015-03-01 2030-01-01 4 25
10 2015-04-12 1 2015-03-01 2030-01-01 4 25
11 2015-04-28 1 2015-03-01 2030-01-01 4 25
12 2015-01-01 2 1900-01-01 2030-01-01 2 20
13 2015-01-27 2 1900-01-01 2030-01-01 2 20
14 2015-01-31 2 1900-01-01 2030-01-01 2 20
15 2015-02-01 2 1900-01-01 2030-01-01 2 20
16 2015-02-03 2 1900-01-01 2030-01-01 2 20
17 2015-02-15 2 1900-01-01 2030-01-01 2 20
18 2015-02-28 2 1900-01-01 2030-01-01 2 20
19 2015-03-01 2 1900-01-01 2030-01-01 2 20
20 2015-03-17 2 1900-01-01 2030-01-01 2 20
21 2015-03-28 2 1900-01-01 2030-01-01 2 20
22 2015-04-12 2 1900-01-01 2030-01-01 2 20
23 2015-04-28 2 1900-01-01 2030-01-01 2 20
# Start and End were only needed for the interval filter; remove them in place
# if they are not wanted in the final table.
result.drop(columns=['Start', 'End'], inplace=True)
答案 2 :(得分:0)
根据我的经验,与间隔合并时,reindex
和ffill
的组合性能要比使用apply
和outer merge
的解决方案好几个数量级。
这可能不是最优雅的解决方案,但有时使用apply
或outer merge
的速度过慢或占用过多空间。在那种情况下(我认为这是大多数情况),reindex
和ffill
很好用,但是您需要将带有开始和结束日期的“间隔”表改为某种“事件”表,其中只需包含每组新值开始生效的日期:
import pandas as pd, datetime
# Sample observations for Id 1: each row is [Date, Id].
table = [
    [datetime.datetime(2015, 1, 1), 1],
    [datetime.datetime(2015, 1, 27), 1],
    [datetime.datetime(2015, 1, 31), 1],
    [datetime.datetime(2015, 2, 1), 1],
    [datetime.datetime(2015, 2, 3), 1],
    [datetime.datetime(2015, 2, 15), 1],
    [datetime.datetime(2015, 2, 28), 1],
    [datetime.datetime(2015, 3, 1), 1],
    [datetime.datetime(2015, 3, 17), 1],
    [datetime.datetime(2015, 3, 28), 1],
    [datetime.datetime(2015, 4, 12), 1],
    [datetime.datetime(2015, 4, 28), 1],
]
df1 = pd.DataFrame(table, columns=['Date', 'Id'])

# The same dates again, this time for Id 2.
df2 = df1.copy()
df2['Id'] = 2

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# reset_index() gives df unique row labels 0..23, which the final positional
# assignment relies on. It also keeps the old labels in an `index` column;
# pass drop=True if that column is not wanted.
df = pd.concat([df1, df2]).reset_index()

# "Event" table: each row is [Id, Start, Fix, Performance], where Start is the
# date from which the Fix/Performance values apply for that Id.
table3 = [
    [1, datetime.datetime(1900, 1, 1), 2, 20],
    [1, datetime.datetime(2015, 3, 1), 4, 25],
    [2, datetime.datetime(1900, 1, 1), 2, 20],
]
df3 = pd.DataFrame(table3, columns=['Id', 'Start', 'Fix', 'Performance'])
df3 = df3.set_index(['Id', 'Start'])

# (Id, Date) key for every row of df, in df's row order.
df_index = df.set_index(['Id', 'Date']).index

# Add the observation keys to the event keys; the added rows start out as NaN.
df3 = df3.reindex(df3.index.union(df_index))

# Sort by (Id, date) and forward-fill within each Id, so that an Id's values
# never leak into the leading rows of the next Id. Then pick the rows back out
# in df's order. astype(int) raises if a row is still NaN, i.e. if a date
# precedes the first event of its Id.
df3 = (
    df3.sort_index(level=[0, 1])
    .groupby(level=0)
    .ffill()
    .reindex(df_index)
    .astype(int)
)

# Both frames now have 24 rows labelled 0..23, so the columns line up by row.
df[['Fix', 'Performance']] = df3.reset_index(drop=True)