我是 Python 新手,下面是我从网上找到的示例代码。我有两个很大的 CSV 文件,一个来自数据库,另一个来自公司的元数据。我想比较两个文件中的特定列,并生成一个新的 CSV 文件,用来显示元数据中缺少哪些记录。请注意,两个 CSV 文件的列数不同,我只想比较两个文件中的特定列。
这是两个csv文件:
从excel表中复制的csv1
start_time end_time aitechid hh_village grpdetails1/farmername grpdetails1/farmermobile 2016-11-26T14:01:47.329+03 2016-11-26T14:29:05.042+03 AI00001 2447 KahsuGebru 919115604 2016-11-26T19:34:42.159+03 2016-11-26T20:39:27.430+03 936891238 2473 Moto Aleka 914370833 2016-11-26T12:13:23.094+03 2016-11-26T14:25:19.178+03 914127382 2390 Hagos 914039654 2016-11-30T14:31:28.223+03 2016-11-30T14:56:33.144+03 920784222 384 Mohammed Ali 923456788 2016-11-30T14:22:38.631+03 2016-11-30T15:06:44.199+03 912320358 378 Habtamu Nuru 913856087 2016-11-29T03:41:36.532+03 2016-11-29T16:33:12.632+03 914763134 2301 Are gaining Giday 0 2016-11-29T16:21:05.012+03 2016-11-29T16:37:27.934+03 914763134 2290 G 912345678 2016-11-30T17:23:34.145+03 2016-11-30T18:00:32.142+03 914763134 2291 Haile tesfu 0 2016-11-30T20:37:54.657+03 2016-11-30T20:56:16.472+03 914763134 2300 Negative Abay 933082495 2016-11-30T21:00:22.063+03 2016-11-30T21:18:44.478+03 914763134 2291 Niguel Amare 914270455
从excel表中复制的csv2
farmermobile 941807851 946741296 9 920212218 915 939555303 961579437 919961811 100004123 972635273 918166831 961579437
我已经尝试过这段代码,但我没有得到预期的输出:
import csv
def get_key(row, columns=("!Sample_title", "!Sample_geo_accession")):
    """Build the lookup key that identifies one CSV row.

    Args:
        row: Mapping of column name to cell value, such as a row produced
            by ``csv.DictReader``.
        columns: Names of the columns that make up the key, in order.
            The default keeps the two columns this function originally
            hard-coded; pass other names to key rows on different columns.

    Returns:
        A tuple holding the value of each key column, in ``columns`` order.

    Raises:
        KeyError: If ``row`` does not contain one of the key columns.
    """
    return tuple(row[column] for column in columns)
def load_csv(filename, key=None):
    """Read a CSV file into a dict that maps each row's key to the full row.

    Args:
        filename: Path of the comma-delimited CSV file to read.
        key: Optional callable that takes a row dict and returns a hashable
            key. When omitted, the module-level ``get_key`` is used.

    Returns:
        A dict mapping every row key to the row dict yielded by
        ``csv.DictReader``.

    Raises:
        ValueError: If two rows of the file produce the same key.
    """
    make_key = get_key if key is None else key
    rows_by_key = {}
    # newline="" hands line-ending handling to the csv module, so quoted
    # fields that contain embedded newlines are parsed correctly.
    with open(filename, newline="") as f:
        for row in csv.DictReader(f, delimiter=","):
            row_key = make_key(row)
            # An explicit raise (rather than assert) keeps the duplicate
            # check active when Python runs with -O.
            if row_key in rows_by_key:
                raise ValueError(
                    "duplicate key {!r} in {!r}".format(row_key, filename)
                )
            rows_by_key[row_key] = row
    return rows_by_key
def diffs(old, new):
    """Yield the report lines describing how ``new`` differs from ``old``.

    Lines for keys present only in ``new`` come first ("ADDED"), then lines
    for keys present only in ``old`` ("REMOVED"), then lines for keys whose
    values differ, as produced by ``changed``.
    """
    only_in_new = new.keys() - old.keys()
    for line in added_or_removed("ADDED", only_in_new, new):
        yield line

    only_in_old = old.keys() - new.keys()
    for line in added_or_removed("REMOVED", only_in_old, old):
        yield line

    for line in changed(old, new):
        yield line
def compare_row(key, old, new):
    """Print the differences between two versions of one row.

    When ``diffs(old, new)`` yields at least one line, the parts of ``key``
    joined with "/" are printed as a heading, followed by each difference
    line prefixed with a space and a trailing blank line. Nothing is
    printed when there are no differences.
    """
    heading_shown = False
    for line in diffs(old, new):
        # Emit the heading lazily, just before the first difference line.
        if not heading_shown:
            print("/".join(key))
            heading_shown = True
        print(" " + line)
    if heading_shown:
        print()
def added_or_removed(state, keys, d):
    """Yield one formatted report line per key, in sorted key order.

    Args:
        state: Label placed in the first column (e.g. "ADDED", "REMOVED").
        keys: Iterable of keys to report; each must be present in ``d``.
        d: Mapping from which the value of every key is read.

    Yields:
        Strings of the form ``"<state>: <key> | <value>"`` with the label
        padded to 10 characters and key and value padded to 30.
    """
    def as_text(value):
        # csv.DictReader fills missing cells with None and collects surplus
        # cells in a list stored under the key None; neither accepts the
        # "{:30}" format spec, so turn them into plain strings first.
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item) for item in value)
        return value

    # Rank a None key ahead of all others so it is never compared against
    # a string, which would raise TypeError; other keys keep natural order.
    for key in sorted(keys, key=lambda k: (k is not None, k)):
        yield "{:10}: {:30} | {:30}".format(
            state, as_text(key), as_text(d[key])
        )
def changed(old, new):
    """Yield one report line for every shared key whose value differs.

    Args:
        old: Mapping holding the earlier values.
        new: Mapping holding the later values.

    Yields:
        Strings of the form ``"CHANGED: <key> | <old value> | <new value>"``
        in sorted key order, with the label padded to 10 characters and the
        remaining fields padded to 30.
    """
    def as_text(value):
        # csv.DictReader fills missing cells with None and collects surplus
        # cells in a list stored under the key None; render both as text so
        # formatting cannot fail on them.
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item) for item in value)
        return str(value)

    common_columns = old.keys() & new.keys()
    # Rank a None key ahead of all others so it is never compared against
    # a string, which would raise TypeError; other keys keep natural order.
    for column in sorted(common_columns, key=lambda k: (k is not None, k)):
        oldvalue = old[column]
        newvalue = new[column]
        if oldvalue != newvalue:
            # "{:30}" already pads to 30 characters, so no extra ljust.
            yield "{:10}: {:30} | {:30} | {:30}".format(
                "CHANGED",
                as_text(column),
                as_text(oldvalue),
                as_text(newvalue),
            )
if __name__ == "__main__":
    # Load both files, each keyed by the tuple returned from get_key().
    oldcsv = load_csv("/media/dmogaka/DATA/week4/combine201709.csv")
    newcsv = load_csv("/media/dmogaka/DATA/week4/combinedmissingrecords.csv")
    # Only keys that occur in both files are compared; rows found in just
    # one of the files are not reported by this script.
    for key in sorted(oldcsv.keys() & newcsv.keys()):
        compare_row(key, oldcsv[key], newcsv[key])