只有外连接python pandas

时间:2017-11-07 20:25:45

标签: python pandas dataframe outer-join





def query_to_df(query):
    """Return the column-wise outer concatenation of the two source frames.

    Builds one DataFrame from ``data_a`` and one from ``data_b`` (both are
    expected to be defined outside this function) and places them side by
    side with ``pd.concat(axis=1, join='outer')``, which aligns rows on the
    index and keeps index labels present in either frame.

    Args:
        query: Currently unused by the body.

    Returns:
        The concatenated DataFrame.
    """
    df_a = pd.DataFrame(data_a)
    df_b = pd.DataFrame(data_b)
    outer_results = pd.concat([df_a, df_b], axis=1, join='outer')
    # The original returned the undefined name `df`, which raised NameError.
    return outer_results


df_a = 
col_a  col_b  col_c
   a1     b1     c1
   a2     b2     c2

df_b = 
col_a  col_b  col_c
   a2     b2     c2
   a3     b3     c3

# they only share the 2nd row:    a2     b2     c2 
# so the outer result should be:
col_a  col_b  col_c  col_a  col_b  col_c
   a1     b1     c1     NA     NA     NA
   NA     NA     NA     a3     b3     c3

我希望得到:

result_1 =
col_a  col_b  col_c
   a1     b1     c1

result_2 =
col_a  col_b  col_c
   a3     b3     c3

最后,您会注意到 a2 b2 c2 被排除,因为所有列都匹配 —— 如何指定按所有列(而不只是 1 列)进行连接?如果 df_a 中有一行 a2 foo c2,我希望该行也出现在 result_1 中。

3 个答案:

答案 0 :(得分:4)

将 merge 与 indicator 参数一起使用,先做 outer 连接,然后用 query 或 boolean indexing 进行过滤:

# With no `on` given, merge joins on every column name the two frames share;
# indicator=True adds a `_merge` column naming the source of each row
# (left_only / right_only / both).
df = df_a.merge(df_b, how='outer', indicator=True)
print (df)
  col_a col_b col_c      _merge
0    a1    b1    c1   left_only
1    a2    b2    c2        both
2    a3    b3    c3  right_only

# Keep rows found only in the left frame, then discard the helper column.
# `axis` is passed by keyword: the positional form drop('_merge', 1) was
# deprecated in pandas 1.3 and removed in 2.0.
a = df.query('_merge == "left_only"').drop('_merge', axis=1)
print(a)
  col_a col_b col_c
0    a1    b1    c1

# Keep rows found only in the right frame, then discard the helper column.
# `axis` is passed by keyword: the positional form drop('_merge', 1) was
# deprecated in pandas 1.3 and removed in 2.0.
b = df.query('_merge == "right_only"').drop('_merge', axis=1)
print(b)
  col_a col_b col_c
2    a3    b3    c3


# Boolean-indexing alternative: rows found only in the left frame.
# `axis` is passed by keyword: the positional form drop('_merge', 1) was
# deprecated in pandas 1.3 and removed in 2.0.
a = df[df['_merge'] == "left_only"].drop('_merge', axis=1)
print(a)
  col_a col_b col_c
0    a1    b1    c1

# Boolean-indexing alternative: rows found only in the right frame.
# `axis` is passed by keyword: the positional form drop('_merge', 1) was
# deprecated in pandas 1.3 and removed in 2.0.
b = df[df['_merge'] == "right_only"].drop('_merge', axis=1)
print(b)
  col_a col_b col_c
2    a3    b3    c3

答案 1 :(得分:4)



  col_a col_b col_c
0    a1    b1    c1
1    a3    b3    c3

您甚至可以使用 pd.concat 的 keys 参数来标明每一行来自哪个 DataFrame。

# keys=['a', 'b'] adds an outer index level labelling which frame each row
# came from; keep=False then removes every copy of any duplicated row.
pd.concat([df_a, df_b], keys=['a', 'b']).drop_duplicates(keep=False)

    col_a col_b col_c
a 0    a1    b1    c1
b 1    a3    b3    c3

答案 2 :(得分:1)

使用 concat 和 drop_duplicates(keep=False):

或者使用 numpy 的 setdiff1d:

# keep=False drops every copy of a duplicated row, so only rows that occur
# exactly once across the stacked frames remain.
new_df = pd.concat([df_a, df_b]).drop_duplicates(keep=False)

    col_a   col_b   col_c
0   a1      b1      c1
1   a3      b3      c3


# NOTE: np.setdiff1d flattens its inputs and returns the sorted unique values
# of the first that are absent from the second. It compares individual cell
# values, not whole rows, so the reshape only recovers rows when the rows
# unique to one frame share no values with the other frame.
#
# Both differences are evaluated before either name is rebound, so the second
# one is taken against the original df_a rather than its already-filtered
# replacement (which would leave the shared row in df_b).
df_a, df_b = (
    pd.DataFrame(
        np.setdiff1d(df_a.values, df_b.values).reshape(-1, df_a.shape[1]),
        columns=df_a.columns,
    ),
    pd.DataFrame(
        np.setdiff1d(df_b.values, df_a.values).reshape(-1, df_b.shape[1]),
        columns=df_b.columns,
    ),
)


    col_a   col_b   col_c
0   a1      b1      c1