在Python中比较相同CSV文件的两个版本-出现错误

时间:2018-10-19 13:57:19

标签: python python-3.x pandas

我正在尝试创建一个Python脚本,以帮助我和我的同事比较同一.csv文件的版本。

到目前为止,这是我的代码:

import pandas as pd

file1 = (r'C:\Jarek S\results_old.csv')
file2 = (r'C:\Jarek S\results_new.csv')
file3 = (r'C:\Jarek S\results_diff.csv')

cols_to_show = ['emp_id', 'Onbr', 'Measure', 'Status', 'Start', 'End', 'SM', 'Rev',
   'PY', 'table', 'terr_id', 'first_nm', 'last_nm', 'job_cd',
   'job_title_nm', 'ctry_cd', 'District', 'Fac_nm']

old = pd.read_csv(file1)
new = pd.read_csv(file2)


def report_diff(x):
return x[0] if x[1] == x[0] else '{0} --> {1}'.format(*x)

#old.reset_index(inplace=True, drop=True)
#new.reset_index(inplace=True, drop=True)

old['version'] = 'old'
new['version'] = 'new'

full_set = pd.concat([old, new], ignore_index=True)

changes = full_set.drop_duplicates(subset=cols_to_show, keep='last')

dupe_emp_ids = changes.set_index('emp_id').index.get_duplicates()

dupes = changes[changes['emp_id'].isin(dupe_emp_ids)]

change_new = dupes[(dupes['version'] == 'new')]
change_old = dupes[(dupes['version'] == 'old')]

change_new = change_new.drop(['version'], axis=1)
change_old = change_old.drop(['version'], axis=1)

change_new.set_index('emp_id', inplace=True)
change_old.set_index('emp_id', inplace=True)

diff_panel = pd.Panel(dict(df1=change_old, df2=change_new))
diff_output = diff_panel.apply(report_diff, axis=0)

changes['duplicate'] = changes['emp_id'].isin(dupe_emp_ids)
removed_emp_ids = changes[(changes['duplicate'] == False) & (changes['version'] == 'old')]
removed_emp_ids.set_index('emp_id', inplace=True)

new_emp_id_set = full_set.drop_duplicates(subset=cols_to_show)

new_emp_id_set['duplicate'] = new_emp_id_set['emp_id'].isin(dupe_emp_ids)

added_emp_ids = new_emp_id_set[(new_emp_id_set['duplicate'] == False) & (new_emp_id_set['version'] == 'new')]
added_emp_ids.set_index('emp_id', inplace=True)

df = pd.concat([diff_output, removed_emp_ids, added_emp_ids], keys=('changed', 'removed', 'added'))
df[cols_to_show].to_csv(file3)

我在这里想要做的是使用emp_nbr列作为参考来比较文件。不幸的是,我是python脚本的新手,唯一得到的是以下错误:

---------------------------------------------------------------------------
InvalidIndexError                         Traceback (most recent call last)
<ipython-input-23-7c071d4996a5> in <module>()
----> 1 diff_panel = pd.Panel(dict(df1=change_old, df2=change_new))

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in __init__(self, data, items, major_axis, minor_axis, copy, dtype)
152 
153         self._init_data(data=data, items=items, major_axis=major_axis,
--> 154                         minor_axis=minor_axis, copy=copy, dtype=dtype)
155 
156     def _init_data(self, data, copy, dtype, **kwargs):

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in _init_data(self, data, copy, dtype, **kwargs)
177             mgr = data
178         elif isinstance(data, dict):
--> 179             mgr = self._init_dict(data, passed_axes, dtype=dtype)
180             copy = False
181             dtype = None

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in _init_dict(self, data, axes, dtype)
214         # extract axis for remaining axes & create the slicemap
215         raxes = [self._extract_axis(self, data, axis=i) if a is None else a
--> 216                  for i, a in enumerate(axes)]
217         raxes_sm = self._extract_axes_for_slice(self, raxes)
218 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in <listcomp>(.0)
214         # extract axis for remaining axes & create the slicemap
215         raxes = [self._extract_axis(self, data, axis=i) if a is None else a
--> 216                  for i, a in enumerate(axes)]
217         raxes_sm = self._extract_axes_for_slice(self, raxes)
218 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\panel.py in _extract_axis(self, data, axis, intersect)
1504             # 2. the indices are not aligned.
1505             index = _get_objs_combined_axis(data.values(), axis=axis,
-> 1506                                             intersect=intersect, sort=None)
1507 
1508         if have_raw_arrays:

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\api.py in _get_objs_combined_axis(objs, intersect, axis, sort)
 52                  if hasattr(obj, '_get_axis')]
 53     if obs_idxes:
---> 54         return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
 55 
 56 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\api.py in _get_combined_index(indexes, intersect, sort)
 67             index = index.intersection(other)
 68     else:
---> 69         index = _union_indexes(indexes, sort=sort)
 70         index = _ensure_index(index)
 71 

 ~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\api.py in _union_indexes(indexes, sort)
105         else:
106             for other in indexes[1:]:
--> 107                 result = result.union(other)
108             return result
109     elif kind == 'array':

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in union(self, other)
2766                 result.extend([x for x in rvals if x not in value_set])
2767         else:
-> 2768             indexer = self.get_indexer(other)
2769             indexer, = (indexer == -1).nonzero()
2770 

~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_indexer(self, target, method, limit, tolerance)
3227 
3228         if not self.is_unique:
 -> 3229             raise InvalidIndexError('Reindexing only valid with uniquely'
3230                                     ' valued Index objects')
3231 

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

我尝试了一些修改,但这对我没有任何帮助。您能帮我解决以下错误吗?还是至少指向可以帮助我创建一个脚本的方向,该脚本将比较两个.csv文件并将结果保存在第三个文件中?

谢谢!

1 个答案:

答案 0 :(得分:0)

我建议在Linux或等效于Windows的Windows(What is the Windows equivalent of the diff command?)上使用“ diff”来实现此结果。仅使用已实现的系统级功能并将文件作为参数传递给它们,应该会容易得多。