如何使用字典将 .csv 文件中的行按每 6 条一组进行汇总?

时间:2015-07-29 20:07:08

标签: python python-2.7 csv

我正在读取一个 .csv 文件,用其中两个字段的值组成一个复合键,然后把具有相同复合键的各行数值累加汇总到一行文本中。如果具有相同复合键的条目超过六个,则需要在输出文件中另起一行。我目前的实现方式相当复杂,希望得到关于如何简化和改进这个过程的建议。下面是我的代码。由于我刚接触编程,也非常欢迎大家就如何改进代码、或者应该使用哪些模块提出其他建议。

import csv
import sys
from collections import OrderedDict
from decimal import Decimal, InvalidOperation

# Zero-based column positions in the input file.
ND_AMT = 14
DEDUCT_AMT = 15
NONDEDUCT_YTD = 16
DEDUCT_YTD = 17
PID = 18
DON_DATE = 19
AMOUNT = 23

# Columns whose values are added together within one output line.
SUM_COLUMNS = (ND_AMT, DEDUCT_AMT, NONDEDUCT_YTD, DEDUCT_YTD, AMOUNT)

# As per BR-0010, every group of up to 6 entries with the same P_ID and
# Don_Date prints on its own line.
GROUP_SIZE = 6


def _open_csv(path, write=False):
    """Open *path* in the mode the csv module expects on this interpreter.

    Python 2's csv module wants binary files; Python 3's wants text files
    opened with ``newline=''`` so the module controls line endings itself.
    """
    if sys.version_info[0] >= 3:
        return open(path, 'w' if write else 'r', newline='')
    return open(path, 'wb' if write else 'rbU')


def _to_amount(text):
    """Convert one CSV cell to a Decimal, treating a blank cell as zero.

    Raises:
        ValueError: if the cell is neither blank nor a valid number.
    """
    text = text.strip()
    if not text:
        return Decimal(0)
    try:
        return Decimal(text)
    except InvalidOperation:
        raise ValueError("not a numeric amount: %r" % (text,))


def _summarize(rows, group_size=GROUP_SIZE):
    """Fold *rows* into one summary row per group of ``group_size`` entries.

    Rows are grouped by (PID, DON_DATE). Within each key, the 1st..Nth rows
    form group 0, the next N form group 1, and so on. The first row of a
    group supplies every non-summed column; SUM_COLUMNS are added
    numerically and stored back as strings.

    Returns:
        (groups, line_count): ``groups`` maps (pid, don_date, group_index)
        to the summary row, in first-seen order; ``line_count`` is the
        number of non-blank data rows consumed.
    """
    groups = OrderedDict()
    seen = {}  # (pid, don_date) -> number of rows seen so far for that key
    line_cntr = 0

    for row in rows:
        if not row:
            # csv.reader yields [] for blank lines; nothing to summarize.
            continue

        base_key = (row[PID], row[DON_DATE])
        position = seen.get(base_key, 0)
        seen[base_key] = position + 1

        # Every row is counted, so the group index keeps advancing and no
        # row is ever overwritten or dropped.
        key = base_key + (position // group_size,)
        totals = groups.get(key)
        if totals is None:
            # Copy so the summary row never aliases the reader's list.
            groups[key] = list(row)
        else:
            for col in SUM_COLUMNS:
                # Cells are strings: add as numbers, not by concatenation.
                totals[col] = str(_to_amount(totals[col]) +
                                  _to_amount(row[col]))

        # keep track of lines processed for recon
        line_cntr += 1

    return groups, line_cntr


def main(argv=None):
    """Summarize the CSV at ``argv[1]`` and write the result to ``argv[2]``.

    The output holds the input's header row, then one summary row per
    group (same columns as the header), then a recon line giving the
    number of data rows processed.

    Args:
        argv: argument list in ``sys.argv`` layout; defaults to
            ``sys.argv`` when omitted.
    """
    if argv is None:
        argv = sys.argv

    # All reading happens inside the ``with`` block, while the file is open.
    with _open_csv(argv[1]) as csvinfile:
        myreader = csv.reader(csvinfile,
                              delimiter=',',
                              quotechar='"')
        # capture header row (None when the input file is empty)
        header_row = next(myreader, None)
        groups, line_cntr = _summarize(myreader)

    with _open_csv(argv[2], write=True) as csvoutfile:
        mywriter = csv.writer(csvoutfile,
                              delimiter=',',
                              quotechar='"')

        if header_row is not None:
            mywriter.writerow(header_row)
        for summary_row in groups.values():
            # Write the row's own columns so they line up with the header.
            mywriter.writerow(summary_row)

        # print recon totals at bottom
        csvoutfile.write("Total lines processed: %d" % (line_cntr))


if __name__ == '__main__':
    main()

下面是一行经过清理(脱敏)的输入:

100029-00001,,100,100,400,100,175,100,700,200,200,500,0,0,0,300,0,2575,105999,10/27/14,197999,23764962,"Frank, David",300,1004,N,N,N,D,Jim and Sharon,Jim and Sharon Grossman,712 81st St NE,,,Logan,ID,54002-0000,UNITED STATES,,,,,Y,jimpagel@gmail.com,,(999) 999-9999,"Frank, David",2289B,BASIC,,

下面是一行输出(与上面的输入并非同一条记录):

11576999:10/28/13,"['100029-00001', '', '750', '750', '750', '750', '0', '950', '1700', '750', '750', '750', '0', '0', '0000', '750750750750', '0000', '7900790079007900', '11576999', '10/28/14', '197999', '137307', 'WHERE NEEDED MOST', '256255050', '1009', 'N', 'N', 'N', 'D', 'Dirk and Bonnie', 'Dirk and Bonnie Johnson', '9999 Woodlawn Dr', '', '', 'Douglasville', 'VA', '30000-1999', 'UNITED STATES', '', '', '', '', 'Y', 'dirk.johnson@yahoo.com', '', '(229) 336-4442', 'WHERE NEEDED MOST', '2289B', 'BASIC', '', '']"

0 个答案:

没有答案