我有两个 DataFrame,想把其中一个 DataFrame 中的金额按 ID 分配到另一个 DataFrame 的各行上。我目前能想到的唯一方法是使用嵌套的 for 循环,但对于大型数据集,这种方法会非常耗时。请问有没有办法提高性能?我的代码如下:
import pandas as pd
import numpy as np
def make_merge(df1, df2):
    """Allocate each ``Amt1`` of *df1* across the same-ID ``Amt2`` rows of *df2*.

    For every ``(ID, Amt1)`` row of *df1*, the rows of *df2* carrying the same
    ``ID`` are visited in *df2* order:

    * while the remaining amount is strictly greater than ``Amt2``, the depot
      receives ``Amt2`` and the remainder shrinks by ``Amt2``;
    * otherwise the depot receives whatever is left and the remainder becomes
      ``0`` (so any later depot for that ID is listed with an allocation of
      ``0``).

    If something is still left after all matching rows were visited, one
    extra row with ``Depot == "not fully allocated"`` carries the remainder.
    IDs of *df1* that never occur in *df2* produce no rows at all.

    Args:
        df1: DataFrame with the columns ``"ID"`` and ``"Amt1"``.
        df2: DataFrame with the columns ``"ID"``, ``"Depot"`` and ``"Amt2"``.
            IDs must be hashable because *df2* is indexed by ID.

    Returns:
        A DataFrame with the columns ``["ID", "Depot", "Allocated Amt1"]``,
        sorted by ``"ID"``.  Within one ID the most recently produced row
        comes first, and the index labels are the row positions before the
        final sort.  Column dtypes are inferred from the collected rows.
    """
    columns = ["ID", "Depot", "Allocated Amt1"]

    # Index df2 by ID once, so each df1 row only touches its own candidates
    # instead of rescanning the whole of df2.
    depots_by_id = {}
    for id2, depot, amt2 in zip(df2["ID"], df2["Depot"], df2["Amt2"]):
        depots_by_id.setdefault(id2, []).append((depot, amt2))

    # Collect plain tuples and build the frame once at the end; growing a
    # DataFrame row by row copies the whole frame on every insertion.
    rows = []
    for id1, amt1 in zip(df1["ID"], df1["Amt1"]):
        # NaN-like IDs never compare equal to anything (not even themselves),
        # so they must not match through the dict's identity shortcut.
        if id1 != id1:
            continue
        candidates = depots_by_id.get(id1)
        if not candidates:
            continue

        remaining = amt1
        for depot, amt2 in candidates:
            if remaining - amt2 > 0:
                rows.append((id1, depot, amt2))
                remaining -= amt2
            else:
                rows.append((id1, depot, remaining))
                remaining = 0
        if remaining > 0:
            rows.append((id1, "not fully allocated", remaining))

    # Newest row first: this keeps the row order and index labels callers
    # have been getting from the previous prepend-based implementation.
    rows.reverse()
    result = pd.DataFrame(rows, columns=columns)

    # A stable sort makes the order of rows sharing an ID deterministic.
    return result.sort_values(by="ID", kind="mergesort")
示例输入和调用方式如下(最后一行的 `c` 用于显示输出结果):
# Sample inputs: the amount to allocate per ID, and the depot rows per ID.
df1 = pd.DataFrame({"ID": ["A", "B"], "Amt1": [10, 20]})
df2 = pd.DataFrame(
    {
        "ID": ["A", "B", "A"],
        "Depot": ["DTC", "BNY", "BNY"],
        "Amt2": [5, 10, 5],
    }
)

# Run the allocation; the bare expression below shows the result when this
# is executed in an interactive session.
c = make_merge(df1, df2)
c
答案 0 :(得分:1)
import pandas as pd
# Sample inputs: the amount to allocate per ID, and the depot rows per ID.
df1 = pd.DataFrame({"ID": ["A", "B"], "Amt1": [10, 20]})
df2 = pd.DataFrame(
    {"ID": ["A", "B", "A"], "Depot": ["DTC", "BNY", "BNY"], "Amt2": [5, 10, 5]}
)

# Calculate allocated: one output row per df2 row, in df2 order.
# NOTE: every depot is listed with its full Amt2; nothing here caps the
# allocation at the Amt1 available for that ID.
df_a = pd.merge(df1, df2, how="right", on="ID")
df_a["Allocated Amt1"] = df_a["Amt2"]
df_a = df_a[["ID", "Depot", "Allocated Amt1"]]

# Calculate not allocated: Amt1 minus the total Amt2 per ID, where positive.
# Only Amt2 is summed; summing the whole group would also try to aggregate
# the string-valued Depot column.
df_totals = df2.groupby("ID", as_index=False)["Amt2"].sum()
df_na = pd.merge(df1, df_totals, on="ID")
df_na["Allocated Amt1"] = df_na["Amt1"] - df_na["Amt2"]
# .copy() so the assignment below writes to an independent frame rather
# than to a filtered view of the merge result.
df_na = df_na[df_na["Allocated Amt1"] > 0].copy()
df_na["Depot"] = "not fully allocated"
df_na = df_na[["ID", "Depot", "Allocated Amt1"]]

# Concatenate the allocated rows with the "not fully allocated" remainders.
df_final = pd.concat([df_a, df_na], axis=0)
print(df_final)