Question

我有两个数据帧（DataFrame），想把其中一个数据帧中的金额分配到另一个数据帧中。目前我能想到的唯一方法是使用嵌套的 for 循环，但对于大型数据集，这种方法会非常耗时。请问有没有提高性能的思路？我的代码如下：

import pandas as pd
import numpy as np
def make_merge(df1, df2):
    """Allocate each ``df1`` amount across the ``df2`` rows sharing its ID.

    For every row of ``df1`` (columns ``ID`` and ``Amt1``), the rows of
    ``df2`` (columns ``ID``, ``Depot`` and ``Amt2``) with the same ID are
    visited in ``df2`` order. A depot row receives its full ``Amt2`` while
    strictly more than that is still unallocated; otherwise it receives
    whatever is left (possibly 0) and the remainder drops to 0. If anything
    is left after all matching depot rows, one extra row with Depot
    ``"not fully allocated"`` carries the remainder. A ``df1`` ID with no
    match in ``df2`` produces no rows at all.

    Parameters:
        df1: DataFrame with columns ``ID`` and ``Amt1``.
        df2: DataFrame with columns ``ID``, ``Depot`` and ``Amt2``.

    Returns:
        DataFrame with columns ``ID``, ``Depot`` and ``Allocated Amt1``,
        sorted by ``ID``.
    """
    # Index df2 by ID once so each df1 row only touches its own depot rows
    # instead of rescanning all of df2.
    depots_by_id = {}
    for id2, depot, amt2 in zip(df2["ID"], df2["Depot"], df2["Amt2"]):
        if id2 != id2:
            # NaN never compares equal to anything, so such a row can never
            # match a df1 ID; keep it out of the lookup table.
            continue
        depots_by_id.setdefault(id2, []).append((depot, amt2))

    # Collect plain row lists and build the DataFrame once at the end;
    # growing a DataFrame row by row copies the whole frame on every insert.
    rows = []
    for id1, amt1 in zip(df1["ID"], df1["Amt1"]):
        remaining = amt1
        matches = depots_by_id.get(id1, ())
        for depot, amt2 in matches:
            if remaining - amt2 > 0:
                rows.append([id1, depot, amt2])
                remaining -= amt2
            else:
                rows.append([id1, depot, remaining])
                remaining = 0
        if remaining > 0 and matches:
            rows.append([id1, "not fully allocated", remaining])

    # Rows are laid out newest-first (index 0 is the last row produced)
    # before the final sort, which fixes the index labels callers see.
    rows.reverse()
    df3 = pd.DataFrame(rows, columns=["ID", "Depot", "Allocated Amt1"])
    return df3.sort_values(by="ID")

输入和输出看起来像这样：

# Sample inputs: the amount to allocate per ID, and the depot rows per ID.
df1 = pd.DataFrame({"ID": ["A", "B"], "Amt1": [10, 20]})
df2 = pd.DataFrame(
    {
        "ID": ["A", "B", "A"],
        "Depot": ["DTC", "BNY", "BNY"],
        "Amt2": [5, 10, 5],
    }
)
c = make_merge(df1, df2)
c

输入和输出表如下所示：

Answer 1

import pandas as pd

df1 = pd.DataFrame({'ID': ['A', 'B'], 'Amt1': [10, 20]})
df2 = pd.DataFrame({'ID': ['A', 'B', 'A'], 'Depot': ['DTC', 'BNY', 'BNY'], 'Amt2': [5, 10, 5]})

# Allocated rows: one output row per df2 row, carrying its Amt2.
# NOTE(review): the right join keeps df2 IDs that are absent from df1, and
# Amt2 is copied without capping it at the remaining Amt1 -- confirm that
# both are acceptable for the real data.
df_a = pd.merge(df1, df2, how="right", on="ID")
df_a["Allocated Amt1"] = df_a["Amt2"]
df_a = df_a[["ID", "Depot", "Allocated Amt1"]]

# Unallocated remainder per ID: Amt1 minus the total Amt2 for that ID.
# Only Amt2 is summed; aggregating the whole frame would also "sum" the
# string Depot column.
df_totals = df2.groupby("ID", as_index=False)["Amt2"].sum()
df_na = pd.merge(df1, df_totals, on="ID")
df_na["Allocated Amt1"] = df_na["Amt1"] - df_na["Amt2"]
# .assign returns a new frame, so the label is not written onto a filtered
# slice (which would raise SettingWithCopyWarning).
df_na = df_na[df_na["Allocated Amt1"] > 0].assign(Depot="not fully allocated")
df_na = df_na[["ID", "Depot", "Allocated Amt1"]]

# Stack the allocated rows on top of the remainder rows.
df_final = pd.concat([df_a, df_na], axis=0)
print(df_final)

使用 pandas 替代嵌套 for 循环的方法

1 个答案: