我正在尝试根据条件“联接”两个DataFrame。
条件
if df1.Year == df2.Year &
df1.Date >= df2.BeginDate or df1.Date <= df2.EndDate &
df1.ID == df2.ID
#if the condition is True, I would love to add an extra column (binary) to df1, something like
#df1.condition = Yes or No.
我的数据如下:
df1:
Year Week ID Date
2020 1 123 2020-01-01 00:00:00
2020 1 345 2020-01-01 00:00:00
2020 2 123 2020-01-07 00:00:00
2020 1 123 2020-01-01 00:00:00
df2:
Year BeginDate EndDate ID
2020 2020-01-01 00:00:00 2020-01-02 00:00:00 123
2020 2020-01-01 00:00:00 2020-01-02 00:00:00 123
2020 2020-01-01 00:00:00 2020-01-02 00:00:00 978
2020 2020-09-21 00:00:00 2020-01-02 00:00:00 978
end_df: #Expected output
Year Week ID Condition
2020 1 123 True #Year is matching, week1 is between the dates, ID is matching too
2019 1 345 False #Year is not matching
2020 2 187 False # ID is not matching
2020 1 123 True # Same as first row.
我想通过循环两个DataFrame来解决此问题:
# Asker's first attempt at a nested row-by-row comparison. This is pseudo-code:
# the indentation was lost and the `.....` lines are placeholders.
# NOTE(review): `iterrrows` is misspelled; the pandas method is `DataFrame.iterrows`.
for row in df1.iterrrows():
# NOTE(review): `iterrows()` yields (index, row) pairs, so `row` / `row2` would be
# tuples here — presumably `for idx, row in ...` was intended; confirm.
for row2 in df2.iterrows():
# NOTE(review): the df2 sample shown with this question has a `Year` column, not `Year2`.
if row['Year'] == row2['Year2']:
# NOTE(review): `row['ID]` is missing the closing quote of the key.
if row['ID] == row2['ID']:
.....
.....
# NOTE(review): assigning to the row object is not guaranteed to write back to df1 — verify.
row['Condition'] = True
else:
row['Condition'] = False
但是...这导致一个接一个的错误。
真的很期待你们将如何解决这个问题。提前谢谢了!
更新1
我创建了一个循环。但是,此循环需要很长时间(而且我不确定如何将值添加到新列中)。
请注意,在df1中,我创建了一个“日期”列(格式与df2中的开始日期和结束日期相同)。
现在的问题是:如何将True值(在循环末尾)添加到df1(作为额外的一列)?
# Asker's second attempt: compare every df1 row with every df2 row on Year, ID
# and the date range. Indentation was lost when the snippet was pasted.
# NOTE(review): `interrows` is misspelled; the pandas method is `DataFrame.iterrows`.
for index, row in df1.interrows():
# Year is cast to str on both sides before it is compared.
row['Year'] = str(row['Year'])
for index1, row1 in df2.iterrows():
row1['Year'] = str(row1['Year'])
if row['Year'] == row1['Year']:
# ID is likewise cast to str on both sides before it is compared.
row['ID'] = str(row['ID'])
row1['ID'] = str(row1['ID'])
# NOTE(review): `row['ID]` is missing the closing quote of the key.
if row['ID] == row1['ID']:
# NOTE(review): the df2 sample names this column `EndDate`, not `Enddate`.
if row['Date'] >= row1['BeginDate'] and row['Date'] <= row1['Enddate']:
# Placeholder for the step the asker does not know how to write: storing the match in df1.
print("I would like to add this YES to df1 in an extra column")
编辑2
尝试了@davidbilla的解决方案:看来“条件”列的结果不正确。如您所见,即使df1.Year != df2.Year,它也会匹配。请注意,df2是按ID排序的(因此所有相同的ID都排在一起)。
答案 0 :(得分:4)
我想您期望这样的事情-如果您尝试按行匹配数据帧(即,将df1的row1与df2的row1进行比较):
/// Starts a Firestore snapshot listener on `main_collection/document_one`
/// and mirrors the document's `field_one` value into `label`.
///
/// - Parameter animated: Whether the appearance is animated; forwarded to `super`.
override func viewWillAppear(_ animated: Bool) {
    // Forward the caller's flag instead of hard-coding `true`.
    super.viewWillAppear(animated)

    let db = Firestore.firestore()
    // NOTE(review): the value returned by `addSnapshotListener` is discarded, so
    // this listener is never removed and a new one is added on every appearance.
    // Keeping the registration needs a stored property outside this method.
    db.collection("main_collection").document("document_one")
        .addSnapshotListener { [weak self] documentSnapshot, error in
            guard let document = documentSnapshot else {
                // `error` is optional; describe it without force-unwrapping.
                let reason = error.map { String(describing: $0) } ?? "unknown error"
                print("Error fetching document: \(reason)")
                return
            }

            // Weak capture: the listener must not keep the view controller alive.
            let property = document.get("field_one")
            self?.label.text = property as? String

            guard let data = document.data() else {
                print("Document data was empty.")
                return
            }
            print("Current data: \(data)")
        }
}
/// Forwards the disappearance callback to the superclass.
///
/// - Parameter animated: Whether the disappearance is animated; forwarded to `super`.
override func viewWillDisappear(_ animated: Bool) {
    // Forward the caller's flag instead of hard-coding `true`.
    super.viewWillDisappear(animated)
}
np.where将条件作为第一个参数,如果条件通过,则第二个参数将是值,如果条件失败,则第三个参数将是值。
编辑1: 根据您的样本数据集
# Row-wise comparison: row i of df1 is checked against row i of df2.
# Series must be combined element-wise with `&`; Python's `or` would try to
# reduce a whole Series to one bool and raise ValueError. Both bounds are
# required for "Date lies between BeginDate and EndDate", hence `&` on both.
df1['condition'] = np.where(
    (df1['Year'] == df2['Year'])
    & (df1['ID'] == df2['ID'])
    & (df1['Date'] >= df2['BeginDate'])
    & (df1['Date'] <= df2['EndDate']),
    True,
    False,
)
输出:
# Sample frames taken from the question; df1 carries no Date column here.
df1 = pd.DataFrame(
    {
        'Year': [2020, 2020, 2020, 2020],
        'Week': [1, 1, 2, 1],
        'ID': [123, 345, 123, 123],
    }
)
df2 = pd.DataFrame(
    {
        'Year': [2020, 2020, 2020, 2020],
        'BeginDate': [
            '2020-01-01 00:00:00',
            '2020-01-01 00:00:00',
            '2020-01-01 00:00:00',
            '2020-09-21 00:00:00',
        ],
        'EndDate': [
            '2020-01-02 00:00:00',
            '2020-01-02 00:00:00',
            '2020-01-02 00:00:00',
            '2020-01-02 00:00:00',
        ],
        'ID': [123, 123, 978, 978],
    }
)

# Parse both range bounds into datetimes.
df2[['BeginDate', 'EndDate']] = df2[['BeginDate', 'EndDate']].apply(pd.to_datetime)

# Row i of df1 is compared with row i of df2. The date-range term is left out
# because df1 has no Date field in this sample.
same_year = df1['Year'] == df2['Year']
same_id = df1['ID'] == df2['ID']
df1['condition'] = np.where(same_year & same_id, True, False)
print(df1)
编辑2:要将df1中的一行与df2中的所有行进行比较
Year Date ID condition
0 2020 1 123 True
1 2020 1 345 False
2 2020 2 123 False
3 2020 1 123 False
这需要使用isin:df1['condition'] = (df1['Year'].isin(df2['Year']))&(df1['ID'].isin(df2['ID'])),即取df1['Year']并将其与df2['Year']的所有值进行比较(ID同理)。
基于示例数据集:
df1:
df2['Year']
df2:
Year Date ID
0 2020 2020-01-01 123
1 2020 2020-01-01 345
2 2020 2020-10-01 123
3 2020 2020-11-13 123
代码更改:
Year BeginDate EndDate ID
0 2020 2020-01-01 2020-02-01 123
1 2020 2020-01-01 2020-01-02 123
2 2020 2020-03-01 2020-05-01 978
3 2020 2020-09-21 2020-10-01 978
输出:
# Pair each df2 row's BeginDate with its EndDate as a list of (begin, end)
# tuples; check_date iterates over this list.
date_range = list(zip(df2['BeginDate'],df2['EndDate']))
def check_date(date, ranges=None):
    """Return True if *date* lies within any (begin, end) range, bounds inclusive.

    Args:
        date: Value to test; must be comparable with the range bounds.
        ranges: Iterable of (begin, end) pairs. Defaults to the module-level
            ``date_range`` list so existing one-argument calls keep working.

    Returns:
        True if at least one range contains *date*, otherwise False
        (including when there are no ranges at all).
    """
    if ranges is None:
        ranges = date_range
    return any(begin <= date <= end for begin, end in ranges)
# Year and ID are each matched against *any* row of df2 (not necessarily the same row).
df1['condition'] = df1['Year'].isin(df2['Year']) & df1['ID'].isin(df2['ID'])
# Kept as its own column so the intermediate date check can be inspected;
# it could be folded straight into df1['condition'].
df1['date_compare'] = df1['Date'].apply(check_date)
# Both columns are already boolean, so combine them directly.
df1['condition'] = df1['condition'] & df1['date_compare']
编辑3: 基于更新后的问题(之前我以为只要year、id和date这3个值各自与df2中的任意一行匹配即可,不必来自同一行)。我想我现在对您的要求有了更好的理解。
Year Date ID condition date_compare
0 2020 2020-01-01 123 True True # Year match, ID match and Date is within the range of df2 row 1
1 2020 2020-01-01 345 False True # Year match, ID no match
2 2020 2020-10-01 123 True True # Year match, ID match, Date is within range of df2 row 4
3 2020 2020-11-13 123 False False # Year match, ID match, but Date is not in range of any row in df2
输出-设置1:
DF1:
# Make sure every date column is a real datetime so the comparisons below are
# chronological rather than string-based.
df2['BeginDate'] = pd.to_datetime(df2['BeginDate'])
df2['EndDate'] = pd.to_datetime(df2['EndDate'])
df1['Date'] = pd.to_datetime(df1['Date'])

# A df1 row is flagged True when a *single* df2 row matches on Year and ID and
# its [BeginDate, EndDate] range (inclusive) contains the df1 Date.
df1['condition'] = False
for idx1, row1 in df1.iterrows():
    # `any` stops scanning df2 at the first matching row; `and` short-circuits
    # the scalar comparisons.
    match = any(
        row1['Year'] == row2['Year']
        and row1['ID'] == row2['ID']
        and row2['BeginDate'] <= row1['Date'] <= row2['EndDate']
        for _, row2 in df2.iterrows()
    )
    df1.at[idx1, 'condition'] = match
DF2:
Year Date ID
0 2020 2020-01-01 123
1 2020 2020-01-01 123
2 2020 2020-01-01 345
3 2020 2020-01-10 123
4 2020 2020-11-13 123
DF1结果:
Year BeginDate EndDate ID
0 2020 2020-01-15 2020-02-01 123
1 2020 2020-01-01 2020-01-02 123
2 2020 2020-03-01 2020-05-01 978
3 2020 2020-09-21 2020-10-01 978
输出-设置2: DF1:
Year Date ID condition
0 2020 2020-01-01 123 True
1 2020 2020-01-01 123 True
2 2020 2020-01-01 345 False
3 2020 2020-01-10 123 False
4 2020 2020-11-13 123 False
DF2:
Year Date ID
0 2019 2019-01-01 s904112
1 2019 2019-01-01 s911243
2 2019 2019-01-01 s917131
3 2019 2019-01-01 sp986214
4 2019 2019-01-01 s510006
5 2020 2020-01-10 s540006
DF1结果:
Year BeginDate EndDate ID
0 2020 2020-01-27 2020-09-02 s904112
1 2020 2020-01-27 2020-09-02 s904112
2 2020 2020-01-03 2020-03-15 s904112
3 2020 2020-04-15 2020-01-05 s904112
4 2020 2020-01-05 2020-05-15 s540006
5 2019 2019-01-05 2019-05-15 s904112
答案 1 :(得分:2)
所需输出的第二行的Year为2019,因此我假设df1.Year的第二行也是2019而不是2020。
如果我理解正确,则需要合并,并过滤掉Date在BeginDate和EndDate范围之外的行。首先,df2中存在重复项和无效的日期范围。在合并之前,我们需要删除重复项和无效范围。无效的日期范围是BeginDate >= EndDate的范围,即df2的索引3所在的行。
# Convert all date columns of both `df1` and `df2` to datetime dtype.
df1['Date'] = pd.to_datetime(df1['Date'])
df2[['BeginDate', 'EndDate']] = df2[['BeginDate', 'EndDate']].apply(pd.to_datetime)

# Keep only well-formed ranges (BeginDate strictly before EndDate) and drop
# exact duplicates before merging.
valid_ranges = (
    df2.loc[df2['BeginDate'] < df2['EndDate'], ['Year', 'ID', 'BeginDate', 'EndDate']]
    .drop_duplicates()
)

# Left-merge on `Year` and `ID`. A df1 row that matches several distinct
# ranges appears several times in the merge result, so tag every df1 row with
# its position and collapse back to one value per row: True if the Date falls
# inside at least one matching range (bounds inclusive).
merged = df1.assign(_row=range(len(df1))).merge(
    valid_ranges, on=['Year', 'ID'], how='left'
)
in_range = merged['Date'].between(merged['BeginDate'], merged['EndDate'])
# groupby sorts by `_row` (0..n-1), so the result lines up positionally with df1.
df1['Condition'] = in_range.groupby(merged['_row']).any().to_numpy()
Out[614]:
Year Week ID Date Condition
0 2020 1 123 2020-01-01 True
1 2019 1 345 2020-01-01 False
2 2020 2 123 2020-01-07 False
3 2020 1 123 2020-01-01 True