问题:
我从多个来源获取数据,寻找相同的数据点。我想更新我的主数据框以反映这些补充文件中的可用性。每个补充文件都特定于主数据框中的一个零售商。
数据:
# Master table: one row per (sku, retailer) listing.
# Row labels are the strings "0".."9"; availability holds the strings
# "True"/"False", or "" where no value is given.
data = {
    column: {str(row): value for row, value in enumerate(values)}
    for column, values in {
        "sku": [
            "123", "321", "456", "678", "123",
            "321", "456", "101", "123", "101",
        ],
        "retailer": [
            "BobStore", "BobStore", "BobStore", "BobStore", "SamStore",
            "SamStore", "RobStore", "SamStore", "RobStore", "BobStore",
        ],
        "description": ["Jacket"] * 10,
        "price": [
            19.99, 18.99, 12.99, 15.99, 12.99,
            12.99, 11.99, 19.99, 16.99, 18.99,
        ],
        "shipping": [
            6.99, 4.99, 6.99, 3.99, 6.99,
            4.99, 6.99, 9.99, 1.99, 2.99,
        ],
        "availability": [
            "True", "False", "", "", "False",
            "True", "", "", "", "True",
        ],
    }.items()
}
df = pd.DataFrame(data=data)
# Supplementary availability feed for BobStore: one row per sku,
# labelled "0".."3".
_bob_labels = ("0", "1", "2", "3")
data = {
    "sku": dict(zip(_bob_labels, ("123", "101", "456", "879"))),
    "availability": dict(zip(_bob_labels, ("False", "True", "True", "True"))),
}
bobStore = pd.DataFrame(data=data)
# Supplementary availability feed for SamStore: one row per sku,
# labelled "0" and "1".
_sam_labels = ("0", "1")
data = {
    "sku": dict(zip(_sam_labels, ("123", "101"))),
    "availability": dict(zip(_sam_labels, ("False", "True"))),
}
samStore = pd.DataFrame(data=data)
尝试解决方案:
# NOTE: neither statement changes `df`. Selecting rows with a boolean mask
# and calling .set_index() each produce a new DataFrame, so .update()
# (which works in place and returns None) only modifies a temporary object
# that is discarded as soon as the statement finishes.
df.loc[df['retailer']=='BobStore'].set_index('sku').update(bobStore.set_index('sku'))
# NOTE(review): exact duplicate of the statement above -- presumably this was
# meant to apply samStore to the 'SamStore' rows; confirm with the author.
df.loc[df['retailer']=='BobStore'].set_index('sku').update(bobStore.set_index('sku'))
期望的输出:
# Expected result: the master table with 'availability' overwritten from the
# per-retailer feeds wherever (retailer, sku) matches; all other cells are
# the same as in the master data.
data = {"availability":{"0":"False","1":"False","2":"True","3":"","4":"False","5":"True","6":"","7":"True","8":"","9":"True"},"description":{"0":"Jacket","1":"Jacket","2":"Jacket","3":"Jacket","4":"Jacket","5":"Jacket","6":"Jacket","7":"Jacket","8":"Jacket","9":"Jacket"},"price":{"0":19.99,"1":18.99,"2":12.99,"3":15.99,"4":12.99,"5":12.99,"6":11.99,"7":19.99,"8":16.99,"9":18.99},"retailer":{"0":"BobStore","1":"BobStore","2":"BobStore","3":"BobStore","4":"SamStore","5":"SamStore","6":"RobStore","7":"SamStore","8":"RobStore","9":"BobStore"},"shipping":{"0":6.99,"1":4.99,"2":6.99,"3":3.99,"4":6.99,"5":4.99,"6":6.99,"7":9.99,"8":1.99,"9":2.99},"sku":{"0":"123","1":"321","2":"456","3":"678","4":"123","5":"321","6":"456","7":"101","8":"123","9":"101"}}
do = pd.DataFrame(data=data)
奖励积分:
帮助我理解为什么更新不能像我预期的那样工作?我在使用sku和零售商的多索引时能够切换值,但是当使用布尔掩码选择我想要更新的切片时,我无法切换值。
多索引(MultiIndex)解决方案:
# Key both frames on (sku, retailer) so that update() can tell apart rows
# that share a sku but belong to different retailers.
index_keys = ['sku', 'retailer']
df.set_index(index_keys, inplace=True)

# The feed has no retailer column of its own; add it before indexing.
bobStore['retailer'] = 'BobStore'
bobStore.set_index(index_keys, inplace=True)

# In place: copies bobStore's non-NA values into the cells of df whose
# index labels and column names match.
df.update(bobStore)
ColdSpeed解决方案尝试:
# Feed rows whose sku also appears anywhere in the master table.
shared_skus = np.intersect1d(bobStore['sku'], df['sku'])
bob_known = bobStore[bobStore['sku'].isin(shared_skus)]

# Fill what the feed lacks from the BobStore rows of df first, then from
# the whole of df. combine_first aligns on index labels and columns.
bob_rows = df[df['retailer'] == 'BobStore']
test1 = bob_known.combine_first(bob_rows).combine_first(df)
这会丢失 'BobStore' 中 sku 321 的 'availability' 值 False。
答案 0 :(得分:1)
我不太确定期望的输出,但我认为您要找的可能是 merge + combine_first。
# Rows of df that belong to each retailer AND have a matching sku in that
# retailer's supplementary feed.
m1 = df.sku.isin(bobStore.sku) & df.retailer.eq('BobStore')
m2 = df.sku.isin(samStore.sku) & df.retailer.eq('SamStore')

# Replace the stale availability of the matched rows with the feed's value.
# drop() is called with `columns=` because the positional axis argument
# (drop('availability', 1)) is rejected by pandas >= 2.0.
# merge() returns a freshly numbered index, so the original row labels are
# put back to let combine_first() align the pieces with df afterwards.
i = (
    df[m1]
    .drop(columns='availability')
    .merge(bobStore, on='sku', how='left')
    .set_index(df[m1].index)
)
j = (
    df[m2]
    .drop(columns='availability')
    .merge(samStore, on='sku', how='left')
    .set_index(df[m2].index)
)

# Priority: updated BobStore rows, then updated SamStore rows, then the
# untouched rows of the original table.
print(i.combine_first(j).combine_first(df))
availability description price retailer shipping sku
0 False Jacket 19.99 BobStore 6.99 123
1 False Jacket 18.99 BobStore 4.99 321
2 True Jacket 12.99 BobStore 6.99 456
3 Jacket 15.99 BobStore 3.99 678
4 False Jacket 12.99 SamStore 6.99 123
5 True Jacket 12.99 SamStore 4.99 321
6 Jacket 11.99 RobStore 6.99 456
7 True Jacket 19.99 SamStore 9.99 101
8 Jacket 16.99 RobStore 1.99 123
9 True Jacket 18.99 BobStore 2.99 101
答案 1 :(得分:1)
这是通过使用多索引:)(我使用两种方法进行索引切片)
# Keep only the feed rows whose sku occurs for that retailer in df.
bobStore=bobStore.loc[bobStore.sku.isin(df.loc[df.retailer=='BobStore','sku'])]
samStore=samStore.loc[samStore.sku.isin(df.loc[df.retailer=='SamStore','sku'])]
# Save the original row labels in an 'index' column, sort by (retailer, sku)
# and use that pair as a MultiIndex.
df=df.reset_index().sort_values(['retailer','sku']).set_index(['retailer','sku'])
idx = pd.IndexSlice
# slice('BobStore') is slice(None, 'BobStore'): every retailer label up to and
# including 'BobStore'.
# NOTE(review): that is "only BobStore" here because it sorts first among the
# retailers in this data -- confirm before reusing with other retailer names.
# The right-hand side is a bare array assigned by position, hence the sort by
# sku; presumably it must match the row order produced by the .loc selection
# -- verify.
df.loc[(slice('BobStore'),bobStore.sku.tolist()),'availability']=bobStore.sort_values('sku').availability.values
# Same update for SamStore, written with pd.IndexSlice and an exact label.
df.loc[idx['SamStore',samStore.sku.tolist()],'availability']=samStore.sort_values('sku').availability.values
# Builds a new frame in the original row order. The result is not assigned,
# so df itself keeps the (retailer, sku) MultiIndex.
df.reset_index().set_index('index').sort_index()
Out[362]:
retailer sku availability description price shipping
index
0 BobStore 123 False Jacket 19.99 6.99
1 BobStore 321 False Jacket 18.99 4.99
2 BobStore 456 True Jacket 12.99 6.99
3 BobStore 678 Jacket 15.99 3.99
4 SamStore 123 False Jacket 12.99 6.99
5 SamStore 321 True Jacket 12.99 4.99
6 RobStore 456 Jacket 11.99 6.99
7 SamStore 101 True Jacket 19.99 9.99
8 RobStore 123 Jacket 16.99 1.99
9 BobStore 101 True Jacket 18.99 2.99
答案 2 :(得分:1)
另一种方法:使用 pd.concat 合并各补充表,再基于新创建的组合键进行 map 映射,即
# Tag every supplementary feed with the retailer it belongs to.
bobStore['col'] = 'BobStore'
samStore['col'] = 'SamStore'

# Stack the feeds into one table. `axis` is passed by keyword because
# pandas >= 2.0 rejects positional arguments after `objs`.
new = pd.concat([bobStore, samStore], axis=0)

# Lookup table: composite "retailer + sku" string -> availability.
# Plain string concatenation replaces .sum(1) over the two string columns;
# it builds the same key without relying on summing object columns.
lookup = new.set_index(new['col'] + new['sku'])['availability']

# Rows of df with no entry in any feed map to NaN ...
x = (df['retailer'] + df['sku']).map(lookup)
# ... and keep their original availability.
df['availability'] = x.combine_first(df['availability'])
输出:
  availability description  price  retailer  shipping  sku
0        False      Jacket  19.99  BobStore      6.99  123
1        False      Jacket  18.99  BobStore      4.99  321
2         True      Jacket  12.99  BobStore      6.99  456
3                   Jacket  15.99  BobStore      3.99  678
4        False      Jacket  12.99  SamStore      6.99  123
5         True      Jacket  12.99  SamStore      4.99  321
6                   Jacket  11.99  RobStore      6.99  456
7         True      Jacket  19.99  SamStore      9.99  101
8                   Jacket  16.99  RobStore      1.99  123
9         True      Jacket  18.99  BobStore      2.99  101