我正在读取.csv
文件,从两个值创建一个复合键,然后将值汇总到一行文本中。如果有超过六个条目具有相同的复合键,我需要在文件中创建一个新行。我目前有一种相当复杂的方式,我正在寻找关于如何简化和改进这个过程的建议。这是我的代码,我总是愿意接受有关如何改进的其他建议,或者由于我对编程很新的事实而使用的模块。
import csv
import sys
def main():
with open(sys.argv[1], 'rbU') as csvinfile:
myreader = csv.reader(csvinfile,
delimiter=',',
quotechar='"')
# initialize variables and dictionaries
line_cntr = 0
dict_ = {}
cntr = {}
# capture header row
header_row = myreader.next()
for row in myreader:
# constant values
ND_AMT = 14
DEDUCT_AMT = 15
NONDEDUCT_YTD = 16
DEDUCT_YTD = 17
PID = 18
DON_DATE = 19
AMOUNT = 23
ANON = 25
INT_0003 = 38
INT_0006 = 39
INT_0028 = 40
# variables
nd_amt = row[ND_AMT]
deduct_amt = row[DEDUCT_AMT]
nondeduct_ytd = row[NONDEDUCT_YTD]
deduct_ytd = row[DEDUCT_YTD]
pid = row[PID]
don_date = row[DON_DATE]
amount = row[AMOUNT]
anon = row[ANON]
int_0003 = row[INT_0003]
int_0006 = row[INT_0006]
int_0028 = row[INT_0028]
# create a composite key for dict_
key = ':'.join((pid, don_date))
# check to see if key exists in dictionary, if not then add
# as per BR-0010 every group of up to 6 entries with same P_ID and Don_Date should print on their own line (i.e. different entry in Dict).
if key in dict_:
if cntr[key] % 6:
# add use of constants to simplify the dict_[keys]
dict_[key][14] += nd_amt
dict_[key][15] += deduct_amt
dict_[key][16] += nondeduct_ytd
dict_[key][17] += deduct_ytd
dict_[key][23] += amount
cntr[key] += 1
else:
key = ':'.join((pid, don_date, str(cntr[key]//6)))
dict_[key] = row
# print("Doing some concatenating funkiness here!")
else:
dict_[key] = row
cntr[key] = 1
# debugging
for key, value in cntr.iteritems():
if value > 6:
print(key, value)
print("Warning, list entry greater than 6 found.")
# keep track of lines processed for recon
line_cntr += 1
with open(sys.argv[2], 'wb') as csvoutfile:
mywriter = csv.writer(csvoutfile,
delimiter=',',
quotechar='"')
mywriter.writerow(header_row)
for key, value in dict_.iteritems():
out_line = (key, value)
mywriter.writerow(out_line)
# print recon totals at bottom
csvoutfile.write("Total lines processed: %d" % (line_cntr))
if __name__ == '__main__':
main()
这是一行清理输入:
100029-00001,,100,100,400,100,175,100,700,200,200,500,0,0,0,300,0,2575,105999,10/27/14,197999,23764962,"Frank, David",300,1004,N,N,N,D,Jim and Sharon,Jim and Sharon Grossman,712 81st St NE,,,Logan,ID,54002-0000,UNITED STATES,,,,,Y,jimpagel@gmail.com,,(999) 999-9999,"Frank, David",2289B,BASIC,,
和一行输出(不是来自同一记录):
11576999:10/28/13,"['100029-00001', '', '750', '750', '750', '750', '0', '950', '1700', '750', '750', '750', '0', '0', '0000', '750750750750', '0000', '7900790079007900', '11576999', '10/28/14', '197999', '137307', 'WHERE NEEDED MOST', '256255050', '1009', 'N', 'N', 'N', 'D', 'Dirk and Bonnie', 'Dirk and Bonnie Johnson', '9999 Woodlawn Dr', '', '', 'Douglasville', 'VA', '30000-1999', 'UNITED STATES', '', '', '', '', 'Y', 'dirk.johnson@yahoo.com', '', '(229) 336-4442', 'WHERE NEEDED MOST', '2289B', 'BASIC', '', '']"