我正在尝试创建一个Python脚本,以帮助我和我的同事比较同一.csv文件的版本。
到目前为止,这是我的代码:
import pandas as pd
file1 = (r'C:\Jarek S\results_old.csv')
file2 = (r'C:\Jarek S\results_new.csv')
file3 = (r'C:\Jarek S\results_diff.csv')
cols_to_show = ['emp_id', 'Onbr', 'Measure', 'Status', 'Start', 'End', 'SM', 'Rev',
'PY', 'table', 'terr_id', 'first_nm', 'last_nm', 'job_cd',
'job_title_nm', 'ctry_cd', 'District', 'Fac_nm']
old = pd.read_csv(file1)
new = pd.read_csv(file2)
def report_diff(x):
return x[0] if x[1] == x[0] else '{0} --> {1}'.format(*x)
#old.reset_index(inplace=True, drop=True)
#new.reset_index(inplace=True, drop=True)
old['version'] = 'old'
new['version'] = 'new'
full_set = pd.concat([old, new], ignore_index=True)
changes = full_set.drop_duplicates(subset=cols_to_show, keep='last')
dupe_emp_ids = changes.set_index('emp_id').index.get_duplicates()
dupes = changes[changes['emp_id'].isin(dupe_emp_ids)]
change_new = dupes[(dupes['version'] == 'new')]
change_old = dupes[(dupes['version'] == 'old')]
change_new = change_new.drop(['version'], axis=1)
change_old = change_old.drop(['version'], axis=1)
change_new.set_index('emp_id', inplace=True)
change_old.set_index('emp_id', inplace=True)
diff_panel = pd.Panel(dict(df1=change_old, df2=change_new))
diff_output = diff_panel.apply(report_diff, axis=0)
changes['duplicate'] = changes['emp_id'].isin(dupe_emp_ids)
removed_emp_ids = changes[(changes['duplicate'] == False) & (changes['version'] == 'old')]
removed_emp_ids.set_index('emp_id', inplace=True)
new_emp_id_set = full_set.drop_duplicates(subset=cols_to_show)
new_emp_id_set['duplicate'] = new_emp_id_set['emp_id'].isin(dupe_emp_ids)
added_emp_ids = new_emp_id_set[(new_emp_id_set['duplicate'] == False) & (new_emp_id_set['version'] == 'new')]
added_emp_ids.set_index('emp_id', inplace=True)
df = pd.concat([diff_output, removed_emp_ids, added_emp_ids], keys=('changed', 'removed', 'added'))
df[cols_to_show].to_csv(file3)
我在这里想要做的是使用emp_nbr列作为参考来比较文件。不幸的是,我是python脚本的新手,唯一得到的是以下错误:
---------------------------------------------------------------------------
InvalidIndexError Traceback (most recent call last)
<ipython-input-23-7c071d4996a5> in <module>()
----> 1 diff_panel = pd.Panel(dict(df1=change_old, df2=change_new))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in __init__(self, data, items, major_axis, minor_axis, copy, dtype)
152
153 self._init_data(data=data, items=items, major_axis=major_axis,
--> 154 minor_axis=minor_axis, copy=copy, dtype=dtype)
155
156 def _init_data(self, data, copy, dtype, **kwargs):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in _init_data(self, data, copy, dtype, **kwargs)
177 mgr = data
178 elif isinstance(data, dict):
--> 179 mgr = self._init_dict(data, passed_axes, dtype=dtype)
180 copy = False
181 dtype = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in _init_dict(self, data, axes, dtype)
214 # extract axis for remaining axes & create the slicemap
215 raxes = [self._extract_axis(self, data, axis=i) if a is None else a
--> 216 for i, a in enumerate(axes)]
217 raxes_sm = self._extract_axes_for_slice(self, raxes)
218
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in <listcomp>(.0)
214 # extract axis for remaining axes & create the slicemap
215 raxes = [self._extract_axis(self, data, axis=i) if a is None else a
--> 216 for i, a in enumerate(axes)]
217 raxes_sm = self._extract_axes_for_slice(self, raxes)
218
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in _extract_axis(self, data, axis, intersect)
1504 # 2. the indices are not aligned.
1505 index = _get_objs_combined_axis(data.values(), axis=axis,
-> 1506 intersect=intersect, sort=None)
1507
1508 if have_raw_arrays:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\api.py in _get_objs_combined_axis(objs, intersect, axis, sort)
52 if hasattr(obj, '_get_axis')]
53 if obs_idxes:
---> 54 return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
55
56
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\api.py in _get_combined_index(indexes, intersect, sort)
67 index = index.intersection(other)
68 else:
---> 69 index = _union_indexes(indexes, sort=sort)
70 index = _ensure_index(index)
71
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\api.py in _union_indexes(indexes, sort)
105 else:
106 for other in indexes[1:]:
--> 107 result = result.union(other)
108 return result
109 elif kind == 'array':
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in union(self, other)
2766 result.extend([x for x in rvals if x not in value_set])
2767 else:
-> 2768 indexer = self.get_indexer(other)
2769 indexer, = (indexer == -1).nonzero()
2770
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_indexer(self, target, method, limit, tolerance)
3227
3228 if not self.is_unique:
-> 3229 raise InvalidIndexError('Reindexing only valid with uniquely'
3230 ' valued Index objects')
3231
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
我尝试了一些修改,但这对我没有任何帮助。您能帮我解决以下错误吗?还是至少指向可以帮助我创建一个脚本的方向,该脚本将比较两个.csv文件并将结果保存在第三个文件中?
谢谢!
答案 0 :(得分:0)
我建议在Linux或等效于Windows的Windows(What is the Windows equivalent of the diff command?)上使用“ diff”来实现此结果。仅使用已实现的系统级功能并将文件作为参数传递给它们,应该会容易得多。