问题:
我目前有一个流程:取昨天的文件,将其与今天的文件进行比较,并删除所有未发生变化的值。这样做的目的是把上传到数据库的数据量限制为仅包含已更改、且当前不在数据库中的数据。
我最近了解到 combine_first 函数,发现用它可以很快得到更新后的合并文件,但我不确定是否有办法借此找出未更改的值。
数据:
# "dest" frame: one inner dict per column, each keyed by item id (the row label).
# Presumably this is the older snapshot that the new data is compared against.
data = {
    "inventory number": {
        "1236": "110", "5188": "101", "19497": "111", "5123": "010", "27358": "011",
    },
    "cost": {
        "1236": 20.80, "5188": 28.86, "19497": 112.69, "5123": 165.03, "27358": 54.02,
    },
    # NOTE: the flags are stored as the strings "True"/"False", not booleans.
    "map": {
        "1236": "True", "5188": "True", "19497": "True", "5123": "True", "27358": "False",
    },
    "cat": {
        "1236": "CONSUMABLE",
        "5188": "ELECTRONICS",
        "19497": "POWER TOOL",
        "5123": "POWER TOOL",
        "27358": "APPLIANCES",
    },
}
dest = pd.DataFrame(data)
# "source" frame: same layout as the frame it is compared against (one inner
# dict per column, keyed by item id), but with some cost/map/cat values changed.
data = {
    "inventory number": {
        "1236": "110", "5188": "101", "19497": "111", "5123": "010", "27358": "011",
    },
    "cost": {
        "1236": 21.80, "5188": 33.86, "19497": 100.69, "5123": 169.03, "27358": 49.99,
    },
    # NOTE: the flags are stored as the strings "True"/"False", not booleans.
    "map": {
        "1236": "True", "5188": "True", "19497": "True", "5123": "False", "27358": "False",
    },
    "cat": {
        "1236": "CONSUMABLE",
        "5188": "ELECTRONICS",
        "19497": "Electronics",
        "5123": "POWER TOOL",
        "27358": "Home/Kitchen",
    },
}
source = pd.DataFrame(data)
先追加再查找重复项的解决方案(可以产生所需的输出):
# Keep only the cells of `source` whose value differs from the matching cell
# of `dest`, matching rows on the inventory-number column.
icol = 'inventory number'

# Stack source on top of dest. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported way to do the same thing.
combined = pd.concat([source, dest])
# Rows without an inventory number cannot be matched, so drop them.
combined = combined.dropna(axis=0, subset=[icol])

# Number the occurrences of each inventory number in stacking order:
# 0 is the row that came from source, 1 is the row that came from dest.
i = combined.groupby(icol).cumcount()

# After unstack + transpose: one row per (column name, inventory number),
# with column 0 holding the source value and column 1 the dest value.
transposed = combined.set_index([icol, i]).unstack(0).T

# Keep the rows where the two values differ, pivot the column names back into
# columns, and keep only the source-side values (sub-frame 0).
cleaned = (
    transposed[transposed[0] != transposed[1]]
    .unstack(0)[0]
    .reset_index()
)
# The keyword is `value`; the original `values=` is not a fillna parameter
# and made the call raise instead of normalising missing cells to NaN.
cleaned = cleaned.fillna(value=np.nan)
期望的输出:
# Expected result: one row per inventory number (rows labelled "0".."4"),
# with NaN in every cat/cost/map cell that holds no value.
data = {
    "inventory number": {"0": "010", "1": "011", "2": "101", "3": "110", "4": "111"},
    "cat": {
        "0": np.nan,
        "1": "Home\\/Kitchen",
        "2": np.nan,
        "3": np.nan,
        "4": "Electronics",
    },
    "cost": {"0": 169.03, "1": 49.99, "2": 33.86, "3": 21.8, "4": 100.69},
    "map": {"0": "False", "1": np.nan, "2": np.nan, "3": np.nan, "4": np.nan},
}
desired_output = pd.DataFrame(data)
使用 combine_first 的解决方案:
# combine_first keeps source's values and falls back to dest only for cells
# that are null/missing in source, so the result is the merged, updated table.
cf = source.combine_first(dest)
# Literal form of the merged table, one inner dict per column keyed by item id.
data = {
    "cat": {
        "1236": "CONSUMABLE",
        "19497": "Electronics",
        "27358": "Home\\/Kitchen",
        "5123": "POWER TOOL",
        "5188": "ELECTRONICS",
    },
    "cost": {
        "1236": 21.8, "19497": 100.69, "27358": 49.99, "5123": 169.03, "5188": 33.86,
    },
    "inventory number": {
        "1236": "110", "19497": "111", "27358": "011", "5123": "010", "5188": "101",
    },
    "map": {
        "1236": "True", "19497": "True", "27358": "False", "5123": "False", "5188": "True",
    },
}
combine_first = pd.DataFrame(data)
答案 0 :(得分:1)
您不再需要 combine_first,只需直接比较两个 DataFrame 并查看发生变化的内容。
# Cell-by-cell equality of the two frames (True where the value is unchanged).
unchanged = source == dest
# Keep source's value where it differs from dest; every other cell becomes NaN.
r = source.where(~unchanged)
# The key column is identical in both frames and was blanked out above,
# so restore it to identify each row.
r['inventory number'] = source['inventory number']
print(r)
cat cost inventory number map
1236 NaN 21.80 110 NaN
19497 Electronics 100.69 111 NaN
27358 Home/Kitchen 49.99 011 NaN
5123 NaN 169.03 010 False
5188 NaN 33.86 101 NaN