Question

df1：

Id      Country       P_Type      Sales
102     Portugal      Industries  1265
163     Portugal      Office      1455
111     Portugal      Clubs       1265
164     Portugal      cars        1751
109     India         House_hold  1651
104     India         Office      1125
124     India         Bakery      1752
112     India         House_hold  1259
105     Germany       Industries  1451
103     Germany       Office      1635
103     Germany       Clubs       1520
103     Germany       cars        1265

df2：

Id    Market        Products    Expenditure
123  Portugal ALL   Wine        5642
136  Portugal St    Wine        4568
158  India QA       Housing     4529
168  India stm      Housing     1576
749  Germany all     Sports      4587
759  Germany sts     Sports      4756

输出df：

Id      Country       P_Type      Sales
102     Portugal      Industries  1265
102     Portugal ALL  Wine        5642
102     Portugal St   Wine        4568
163     Portugal      Office      1455
111     Portugal      Clubs       1265
164     Portugal      cars        1751
109     India         House_hold  1651
109     India QA      Housing     4529
109     India stm     Housing     1576
104     India         Office      1125
124     India         Bakery      1752
112     India         House_hold  1259
105     Germany       Industries  1451
105     Germany all    Sports      4587
105     Germany sts    Sports      4756
103     Germany       Office      1635
103     Germany       Clubs       1520
103     Germany       cars        1265

我需要把两个数据帧合并，但 df2 中的行要插入到 df1 中的特定位置。例如，df2 中前两行的 “Market” 列属于葡萄牙，而 df1 中国家为葡萄牙的第一行 Id 为 102，因此这两行应紧跟在葡萄牙的第一行之后，并使用相同的 Id（102）。其他国家也是如此。

Answer 1

我想我可以通过创建一个伪排序键来做到这一点：

# Relabel df2 with df1's column names so the two frames stack column-for-column.
# A 'sortkey' column left on df1 by an earlier run is excluded from the labels,
# so no placeholder column has to be added to df1 first.
df2 = df2.set_axis(df1.columns.drop('sortkey', errors='ignore'), axis=1)

# Rank rows inside each country: 0 = first df1 row of the country,
# 1 = df2 rows (assigned in the concat below), 2 = the remaining df1 rows.
df1['sortkey'] = df1['Country'].duplicated().map({True: 2, False: 0})

# The key function is applied to each sort column in turn: 'Country' is reduced
# to its first word ("Portugal ALL" -> "Portugal") and 'sortkey' becomes its
# one-character string, so rows are ordered by country and then by rank.
df_sorted = (pd.concat([df1, df2.assign(sortkey=1)])
               .sort_values(['Country', 'sortkey'],
                            key=lambda x: x.astype(str).str.split(' ').str[0]))

# Every row takes the Id of the first row of its country group after sorting.
df_sorted['Id'] = df_sorted.groupby(df_sorted['Country'].str.split(' ').str[0])['Id'].transform('first')
print(df_sorted.drop('sortkey', axis=1))

输出：

     Id       Country      P_Type  Sales
8   105       Germany  Industries   1451
4   105   Germany all      Sports   4587
5   105   Germany sts      Sports   4756
9   105       Germany      Office   1635
10  105       Germany       Clubs   1520
11  105       Germany        cars   1265
4   109         India  House_hold   1651
2   109      India QA     Housing   4529
3   109     India stm     Housing   1576
5   109         India      Office   1125
6   109         India      Bakery   1752
7   109         India  House_hold   1259
0   102      Portugal  Industries   1265
0   102  Portugal ALL        Wine   5642
1   102   Portugal St        Wine   4568
1   102      Portugal      Office   1455
2   102      Portugal       Clubs   1265
3   102      Portugal        cars   1751

注意：这里使用了 sort_values 方法的 key 参数，该参数需要 pandas 1.1.0 及以上版本。

Answer 2

from itertools import chain

# Give the second frame (df1) the same column labels as the first frame (df).
df1.columns = df.columns
# Blank out df1's Id column; the forward fill below replaces the blanks with
# the Id of the row placed just above.
df1["Id"] = np.nan

# First word of each df1 "Country" value, used to match df1 rows to a group of df.
market_root = df1.Country.str.split().str[0]

# For each Country group of df (in order of first appearance) emit three pieces:
# the group's first row, the df1 rows whose first word matches the group,
# and the group's remaining rows. chain flattens the triples into one stream.
pieces = chain.from_iterable(
    (group.iloc[[0]], df1.loc[market_root.isin(group.Country)], group.iloc[1:])
    for _, group in df.groupby("Country", sort=False)
)

# Stack the pieces, forward-fill missing values, and store Id as nullable Int16.
merger = pd.concat(pieces, ignore_index=True).ffill().astype({"Id": "Int16"})

merger.head()

    Id  Country       P_Type    Sales
0   102 Portugal    Industries  1265
1   102 Portugal ALL    Wine    5642
2   102 Portugal St     Wine    4568
3   163 Portugal        Office  1455
4   111 Portugal        Clubs   1265

Answer 3

# Relabel df2's columns in place so they carry the same names as df1's.
column_aliases = {'Market': 'Country', 'Products': 'P_Type', 'Expenditure': 'Sales'}
df2.rename(columns=column_aliases, inplace=True)

 
def Insert_row(row_number, df, row_value):
    """Return a copy of ``df`` with ``row_value`` inserted at position ``row_number``.

    The existing index of ``df`` is ignored: rows are relabelled by position,
    rows at ``row_number`` and beyond are shifted down by one, and the new row
    takes the freed label. The result is indexed ``0 .. len(df)``.

    The input frame is left untouched; callers must use the return value.

    Parameters:
        row_number: position (0-based) at which the new row should appear;
            ``len(df)`` appends at the end.
        df: the DataFrame to insert into.
        row_value: values for the new row, one per column of ``df``.

    Raises:
        ValueError: if ``row_number`` is outside ``0 .. len(df)``.
    """
    n_rows = df.shape[0]
    if not 0 <= row_number <= n_rows:
        raise ValueError(
            f"row_number must be between 0 and {n_rows}, got {row_number}"
        )

    # Work on a copy so the caller's frame keeps its own index and rows.
    result = df.copy()

    # Rows above the insertion point keep labels 0..row_number-1; the rest move
    # to row_number+1..n_rows, leaving label `row_number` free for the new row.
    result.index = [*range(row_number), *range(row_number + 1, n_rows + 1)]

    # Setting a missing label appends the row; sorting the index then moves it
    # into its intended position.
    result.loc[row_number] = row_value
    return result.sort_index()




def proper_plc(index_2):
    """Insert row ``index_2`` of ``df2`` right after its country's first row in ``df1``.

    Reads the module-level frames ``df1`` and ``df2``; ``df2`` must already
    have a ``Country`` column. The market label is taken from the ``df2`` row
    itself, so the result does not depend on any loop variable of the caller.

    A ``df1`` row matches when its ``Country`` text occurs inside the market
    label (e.g. ``"India"`` in ``"India QA"``). The inserted row takes the
    first-column value (the Id) of the matched ``df1`` row.

    Parameters:
        index_2: index label of the ``df2`` row to insert.

    Returns:
        The frame returned by ``Insert_row``; the caller is expected to rebind
        ``df1`` to it.

    Raises:
        KeyError: if no ``df1`` country occurs in the market label.
    """
    new_row = list(df2.loc[index_2])
    market = df2.loc[index_2, 'Country']

    # Position of the first df1 row whose country name appears in the label.
    anchor = next(
        (pos for pos, country in enumerate(df1.Country) if country in market),
        None,
    )
    if anchor is None:
        raise KeyError(f"no df1 country found in market label {market!r}")

    # The inserted row inherits the first column (Id) of the anchor row.
    new_row[0] = list(df1.loc[anchor])[0]
    return Insert_row(anchor + 1, df1, new_row)
    

# Walk df2 from its last row to its first. proper_plc always inserts directly
# after the country's first df1 row, so inserting in reverse leaves the rows of
# each market in their original df2 order ("Portugal ALL" before "Portugal St").
for index_2 in reversed(range(len(df2))):
    # Keep the module-level name `ids` in step with the row being inserted.
    ids = df2.Country.iloc[index_2]
    df1 = proper_plc(index_2)

如何在 pandas 中追加两个数据框

3 个答案: