我需要合并以下2个数据框:
df1:
A B C D F
0 1 a zz 10 11
1 1 a zz 15 11
2 2 b yy 20 12
3 3 c xx 30 13
4 4 d ww 40 14
5 5 e vv 50 15
6 6 f uu 60 16
7 7 g NaN 70 17
8 8 h ss 80 18
9 9 NaN rr 90 19
10 13 m nn 130 113
11 15 o ll 150 115
df2:
A B C D G
0 1 NaN zz 15 100
1 6 f uu 60 600
2 7 g tt 70 700
3 10 j qq 100 1000
4 12 l NaN 120 1200
5 14 n NaN 140 1400
合并的数据框应为:
A B C D F G
0 1 a zz 10 11 None
1 1 a zz 15 11 100
2 2 b yy 20 12 None
3 3 c xx 30 13 None
4 4 d ww 40 14 None
5 5 e vv 50 15 None
6 6 f uu 60 16 600
7 7 g tt 70 17 700
8 8 h ss 80 18 None
9 9 NaN rr 90 19 None
10 13 m nn 130 113 None
11 15 o ll 150 115 None
12 10 j qq 100 None 1000
13 12 l NaN 120 None 1200
14 14 n NaN 140 None 1400
以下是生成df1和df2的代码:
# Sample frames used throughout the question. Missing entries are written as
# np.nan; the upper-case alias np.NAN was removed in NumPy 2.0.
df1 = pd.DataFrame({
    'A': [1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 15],
    'B': ['a', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', np.nan, 'm', 'o'],
    'C': ['zz', 'zz', 'yy', 'xx', 'ww', 'vv', 'uu', np.nan, 'ss', 'rr', 'nn', 'll'],
    'D': [10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 130, 150],
    'F': [11, 11, 12, 13, 14, 15, 16, 17, 18, 19, 113, 115],
})
df2 = pd.DataFrame({
    'A': [1, 6, 7, 10, 12, 14],
    'B': [np.nan, 'f', 'g', 'j', 'l', 'n'],
    'C': ['zz', 'uu', 'tt', 'qq', np.nan, np.nan],
    'D': [15, 60, 70, 100, 120, 140],
    'G': [100, 600, 700, 1000, 1200, 1400],
})
我尝试了以下方法:
# Attempt 1: no key given, so the join uses every column name the two frames share.
md1 = pd.merge(df1, df2, how='outer')
# Attempts 2 and 3: join on A and D only, written once with `on=` and once with
# the equivalent `left_on=` / `right_on=` pair.
md2 = pd.merge(df1, df2, on=['A', 'D'], how='outer')
md3 = pd.merge(df1, df2, left_on=['A', 'D'], right_on=['A', 'D'], how='outer')
# Attempt 4: all four shared columns listed explicitly as the join key.
md4 = pd.merge(
    df1,
    df2,
    left_on=['A', 'B', 'C', 'D'],
    right_on=['A', 'B', 'C', 'D'],
    how='outer',
)
以下是md1和md4的结果(相同的结果):
print(md1.to_string())
A B C D F G
0 1 a zz 10 11.0 NaN
1 1 a zz 15 11.0 NaN
2 2 b yy 20 12.0 NaN
3 3 c xx 30 13.0 NaN
4 4 d ww 40 14.0 NaN
5 5 e vv 50 15.0 NaN
6 6 f uu 60 16.0 600.0
7 7 g NaN 70 17.0 NaN
8 8 h ss 80 18.0 NaN
9 9 NaN rr 90 19.0 NaN
10 13 m nn 130 113.0 NaN
11 15 o ll 150 115.0 NaN
12 1 NaN zz 15 NaN 100.0
13 7 g tt 70 NaN 700.0
14 10 j qq 100 NaN 1000.0
15 12 l NaN 120 NaN 1200.0
16 14 n NaN 140 NaN 1400.0
以下是md2和md3的结果(相同的结果):
print(md2.to_string())
A B_x C_x D F B_y C_y G
0 1 a zz 10 11.0 NaN NaN NaN
1 1 a zz 15 11.0 NaN zz 100.0
2 2 b yy 20 12.0 NaN NaN NaN
3 3 c xx 30 13.0 NaN NaN NaN
4 4 d ww 40 14.0 NaN NaN NaN
5 5 e vv 50 15.0 NaN NaN NaN
6 6 f uu 60 16.0 f uu 600.0
7 7 g NaN 70 17.0 g tt 700.0
8 8 h ss 80 18.0 NaN NaN NaN
9 9 NaN rr 90 19.0 NaN NaN NaN
10 13 m nn 130 113.0 NaN NaN NaN
11 15 o ll 150 115.0 NaN NaN NaN
12 10 NaN NaN 100 NaN j qq 1000.0
13 12 NaN NaN 120 NaN l NaN 1200.0
14 14 NaN NaN 140 NaN n NaN 1400.0
但是上述结果都不是我从合并操作中所需要的!
所以,我写了一个函数来获取我想要的东西:
def merge_df(d1, d2, on_columns):
    """Outer-merge ``d2`` into ``d1`` on ``on_columns``, coalescing shared columns.

    Columns that exist in both frames are kept as a single column instead of
    being split into ``_x`` / ``_y`` variants. For every row of ``d2``:

    * if no row of ``d1`` has the same values in ``on_columns``, the row is
      appended to the result (columns that only ``d1`` has are left missing);
    * otherwise the first matching row is updated: shared columns that are
      missing in ``d1`` are filled from ``d2``, and the ``d2``-only columns
      are copied over.

    Args:
        d1: Left frame. Its rows and column order form the base of the
            result. Appended rows are labelled ``len(d1)``, ``len(d1) + 1``,
            ..., so ``d1`` is expected to carry a default ``RangeIndex``.
        d2: Right frame whose rows are merged in; any index is accepted.
        on_columns: Non-empty sequence of column names, present in both
            frames, used to decide whether two rows match.

    Returns:
        A new DataFrame with ``d1``'s columns followed by the columns that
        only ``d2`` has. Neither input is modified.

    Raises:
        ValueError: If ``on_columns`` is empty.
    """
    import pandas as pd  # local import keeps the function self-contained

    on_columns = list(on_columns)
    if not on_columns:
        raise ValueError("on_columns must name at least one key column")

    d1_columns = list(d1.columns)
    d2_columns = list(d2.columns)
    common_columns = [c for c in d1_columns if c in d2_columns]
    extra_columns_in_d2 = [c for c in d2_columns if c not in d1_columns]

    # Start from a copy of d1 and add the d2-only columns, initially empty.
    md = d1.copy(deep=True)
    for c in extra_columns_in_d2:
        md[c] = [None] * len(md)

    next_label = len(d1)
    for pos in range(len(d2)):
        # Positional access so d2 does not need a default RangeIndex.
        d2_values = {c: d2[c].iloc[pos] for c in d2_columns}

        # Compare the key columns directly instead of building a query
        # string, which only worked for values whose str() is a valid literal.
        mask = pd.Series(True, index=d1.index)
        for c in on_columns:
            mask = mask & (d1[c] == d2_values[c])
        matches = d1.index[mask.to_numpy(dtype=bool)]

        if len(matches) == 0:
            # Build the new row in md's own column order; columns that only
            # d1 has are absent from d2_values and therefore become None.
            md.loc[next_label] = [d2_values.get(c) for c in md.columns]
            next_label += 1
            continue

        target = matches[0]
        for c in common_columns:
            # pd.isna covers None and NaN in any dtype, unlike `is np.nan`.
            if pd.isna(md.loc[target, c]):
                md.loc[target, c] = d2_values[c]
        for c in extra_columns_in_d2:
            md.loc[target, c] = d2_values[c]

    return md
使用此函数即可得到所需的合并数据框:
# Merge on A and D with the custom helper; shared columns B and C are coalesced.
md5 = merge_df(d1=df1, d2=df2, on_columns=['A', 'D'])
为了得到所需结果,我是否遗漏了内置数据框合并方法的某些基本用法?
答案 0 :(得分:1)
合并操作的写法有误,请尝试以下代码:
# Outer join keyed on A and D only.
result = pd.merge(df1, df2, how='outer', on=['A', 'D'])
尝试
# Outer-join on the key columns. The shared non-key columns come back in pairs:
# B_x / C_x hold df1's values and B_y / C_y hold df2's.
df1 = df1.merge(df2, on=['A', 'D'], how='outer')
# Coalesce each pair: keep the left value and fall back to the right one where
# the left is missing. fillna is vectorised and uses a real missing-value test,
# unlike a row-wise apply that checks `is np.nan` by identity.
df1['C'] = df1['C_x'].fillna(df1['C_y'])
df1['B'] = df1['B_x'].fillna(df1['B_y'])
df1 = df1.drop(columns=['B_x', 'B_y', 'C_x', 'C_y'])
答案 1 :(得分:1)
您可以先合并,然后使用 `.assign` 和 `.combine_first`。合并后产生的成对列(`B_x`/`B_y`、`C_x`/`C_y`)需要重新合并为一列:以右 df 的值为基础,凡是左 df 在该位置有值的地方,就用左 df 的值覆盖。`.combine_first` 正是这样做的。
# Outer-join on A and D; B and C come back as B_x/B_y and C_x/C_y pairs.
m = pd.merge(df1, df2, on=['A', 'D'], how='outer')
# Rebuild single B and C columns (left value first, right value as fallback),
# drop the suffixed helpers and restore the desired column order.
# `columns=` is used because pandas >= 2.0 rejects a positional `axis` in drop().
(
    m.assign(
        B=m['B_x'].combine_first(m['B_y']),
        C=m['C_x'].combine_first(m['C_y']),
    )
    .drop(columns=['B_x', 'C_x', 'B_y', 'C_y'])
    .loc[:, ['A', 'B', 'C', 'D', 'F', 'G']]
)
结果
A B C D F G
0 1 a zz 10 11.0 NaN
1 1 a zz 15 11.0 100.0
2 2 b yy 20 12.0 NaN
3 3 c xx 30 13.0 NaN
4 4 d ww 40 14.0 NaN
5 5 e vv 50 15.0 NaN
6 6 f uu 60 16.0 600.0
7 7 g tt 70 17.0 700.0
8 8 h ss 80 18.0 NaN
9 9 NaN rr 90 19.0 NaN
10 13 m nn 130 113.0 NaN
11 15 o ll 150 115.0 NaN
12 10 j qq 100 NaN 1000.0
13 12 l NaN 120 NaN 1200.0
14 14 n NaN 140 NaN 1400.0