需要帮助分割具有不可打印字符的固定宽度字符串

时间:2012-07-19 06:43:47

标签: python unicode csv split fixed-width

我有一个特别讨厌的固定宽度文件可供使用。它没有以我认为的格式编码。简而言之,我正在尝试做各种各样的事情:

  1. 在RPLY01之前略过所有空格。
  2. 删除那些古怪的\ x00 *字符。我已经考虑过删除它们,但无论我尝试过什么,都没有删除星号(*)中包含的字符。
  3. 拆分FDXSPD01 /
  4. 在CHKP01拆分。
  5. 最后,将每个CSV拆分成一个漂亮的列表,并将所有其他废话(RPLY和CHKP)陷入可解析的友好格式。
  6. 这是原始输出:

    �cRPLY01  IREQ    0000011                                                                         
    N00    �9FDXSPD01"CASH","","",10219575.34,0.00,0,"000000000000773"
    �EFDXSPD01"CAD","CANADA DOLLAR","CU",-14564.52,0.00,0,"000000000000773"    
    �PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",3644.00,0.00,0,"000000000000773"
    �QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-3641.07,0.00,0,"000000000000773"
    �PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",1457.00,0.00,0,"000000000000773"
    �QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-1456.43,0.00,0,"000000000000773"
    �DFDXSPD01"CD5427","JOHN CD","CD",100000.00,197.95,0,"000000000000773"
    �LFDXSPD01"CP5427","COMMERCIAL PAPER","CP",9925000.00,0.00,0,"000000000000773"
    �JFDXSPD01"FS5427","JOHNFORSTOCK","FS",81000.00,10000.00,0,"000000000000773"
    �FFDXSPD01"FUT5427","JOHNFUTURE","FT",264000.00,0.00,0,"000000000000773"
    �BFDXSPD01"JKSTOCK","JK STOCK","S",31500.00,0.00,0,"000000000000773"
    �LFDXSPD01"MB5427","JOHN MUNI BOND","M",255000.00,15611.92,0,"000000000000773"
    �QFDXSPD01"MBS5427","JOHNMORTGAGEBACKED","G1",996500.00,2916.67,0,"000000000000773"
    �EFDXSPD01"OPT5427","JOHNOPTION","O",464000.00,0.00,0,"000000000000773"
    �CFDXSPD01"TB5427","TREASURY BILL","TI",0.00,0.00,0,"000000000000773"
    �HFDXSPD01"UB5427","JOHN BOND","G",2994000.00,13281.26,0,"000000000000773"
    �9FDXSPD01"UNITS","UNITS","S",0.00,0.00,0,"000000000000773"
    �CHKP01  N0000000170
    

    这是相同的输出,但通过repr

    \x00cRPLY01  IREQ    0000011
    N00    \x009FDXSPD01"CASH","","",10219575.34,0.00,0,"000000000000773"
    \x00EFDXSPD01"CAD","CANADA DOLLAR","CU",-14564.52,0.00,0,"000000000000773"
    \x00PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",3644.00,0.00,0,"000000000000773"
    \x00QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-3641.07,0.00,0,"000000000000773"
    \x00PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",1457.00,0.00,0,"000000000000773"
    \x00QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-1456.43,0.00,0,"000000000000773"
    \x00DFDXSPD01"CD5427","JOHN CD","CD",100000.00,197.95,0,"000000000000773"
    \x00LFDXSPD01"CP5427","COMMERCIAL PAPER","CP",9925000.00,0.00,0,"000000000000773"
    \x00JFDXSPD01"FS5427","JOHNFORSTOCK","FS",81000.00,10000.00,0,"000000000000773"
    \x00FFDXSPD01"FUT5427","JOHNFUTURE","FT",264000.00,0.00,0,"000000000000773"
    \x00BFDXSPD01"JKSTOCK","JK STOCK","S",31500.00,0.00,0,"000000000000773"
    \x00LFDXSPD01"MB5427","JOHN MUNI BOND","M",255000.00,15611.92,0,"000000000000773"
    \x00QFDXSPD01"MBS5427","JOHNMORTGAGEBACKED","G1",996500.00,2916.67,0,"000000000000773"
    \x00EFDXSPD01"OPT5427","JOHNOPTION","O",464000.00,0.00,0,"000000000000773"
    \x00CFDXSPD01"TB5427","TREASURY BILL","TI",0.00,0.00,0,"000000000000773"
    \x00HFDXSPD01"UB5427","JOHN BOND","G",2994000.00,13281.26,0,"000000000000773"
    \x009FDXSPD01"UNITS","UNITS","S",0.00,0.00,0,"000000000000773"
    \x00\x13CHKP01  N0000000170'
    

    到目前为止,这是我的思路:

    #!/usr/local/bin/python
    import string
    import struct
    import re
    import array
    
    cols = []
    splitcols = []
    
    def removeNonAscii(s): return "".join(i for i in s if ord(i) < 128)
    rgx = r'[\x00-\x20\x22\x2F\x3A\x3C\x3E\x5C]'
    
    fname2 = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/output.txt', 'r')
    fname = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/output.txt', 'r')
    ename2 = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/error_output.txt', 'r')
    ename = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/error_output.txt', 'r')
    losfile = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/error_output.txt', 'r')
    losfile2 = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/error_output.txt', 'r')
    
    filename = '/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/output.txt'
    filename2 = '/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/error_output.txt'
    
    almost = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/output.txt', 'r')
    almost2 = open('/Users/abcd/Documents/SVN/scripts/Python/project/I1DL/output.txt', 'r')
    
    for line in fname.read().split('\t'):
        print repr(line)
        filtered_string = filter(lambda x: x in string.printable, line)
        print "unfilter line: " + line
        print "filtered line: " + filtered_string
        removeNonAscii(line)
    
    print "======================================================================================"
    
    for line in ename.read().split('\t'):
        filtered_string = filter(lambda x: x in string.printable, line)
        print "error unfilter line: " + line
        print "error filtered line: " + filtered_string
    
    print "======================================================================================"
    
    for line in ename2.read().split('\t'):
        for chars in line:
            if chars in string.printable:
                print chars
            else:
                print "!!!!!!!!!!!!!!!"
    
    print "======================================================================================"
    
    #for line in fname2.read().split('\t'):
        #for chars in line:
            #if chars in string.printable:
                #print chars
            #else:
                #print "@@@@@@@@@@@@@@@"
    
    print "======================================================================================"
    print "======================================================================================"
    
    testlist = []
    testlist2 = []
    
    with open(filename) as f:
        readarray = f.readline()
        for eachitem in readarray[6:]:
            if(all(ord(c) < 127 and c in string.printable for c in eachitem)):
                testlist.append(eachitem)
    print repr(''.join(testlist))
    
    with open(filename2) as f:
        readarray2 = f.readline()
        for eachitem in readarray2[6:]:
            testlist2.append(removeNonAscii(eachitem))
    print repr(''.join(testlist2))
    
    with open(filename) as f:
        readarray = f.readline()
        print repr(readarray[6:].split(' FDXSPD01'))
        for eachitem in readarray[6:]:
            if(all(ord(c) < 127 and c in string.printable for c in eachitem)):
                testlist.append(eachitem)
    print repr(''.join(testlist))
    #for eachitem in almost.read():
        #if(all(ord(c) < 127 and c in string.printable for c in eachitem)):
            #testlist.append(eachitem)
    #print ''.join(testlist)
    
    #for eachitem in losfile2.read():
        #testlist2.append(removeNonAscii(eachitem))
    #print ''.join(testlist2)
    

    也许是睡眠不足但我似乎无法找到正确的答案。也许精通Python的人可以指明我的方​​式。

1 个答案:

答案 0 :(得分:1)

好吧,我也有点睡着了,但我觉得咖啡已经开始了。

走过repr()我认为这只是做你想要的,虽然不一定有效或强有力。

import csv, re

data="""
\x00cRPLY01  IREQ    0000011
N00    \x009FDXSPD01"CASH","","",10219575.34,0.00,0,"000000000000773"
\x00EFDXSPD01"CAD","CANADA DOLLAR","CU",-14564.52,0.00,0,"000000000000773"
\x00PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",3644.00,0.00,0,"000000000000773"
\x00QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-3641.07,0.00,0,"000000000000773"
\x00PFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",1457.00,0.00,0,"000000000000773"
\x00QFDXSPD01"CCTUSD","CURRENCY CONTRACT - USD","CC",-1456.43,0.00,0,"000000000000773"
\x00DFDXSPD01"CD5427","JOHN CD","CD",100000.00,197.95,0,"000000000000773"
\x00LFDXSPD01"CP5427","COMMERCIAL PAPER","CP",9925000.00,0.00,0,"000000000000773"
\x00JFDXSPD01"FS5427","JOHNFORSTOCK","FS",81000.00,10000.00,0,"000000000000773"
\x00FFDXSPD01"FUT5427","JOHNFUTURE","FT",264000.00,0.00,0,"000000000000773"
\x00BFDXSPD01"JKSTOCK","JK STOCK","S",31500.00,0.00,0,"000000000000773"
\x00LFDXSPD01"MB5427","JOHN MUNI BOND","M",255000.00,15611.92,0,"000000000000773"
\x00QFDXSPD01"MBS5427","JOHNMORTGAGEBACKED","G1",996500.00,2916.67,0,"000000000000773"
\x00EFDXSPD01"OPT5427","JOHNOPTION","O",464000.00,0.00,0,"000000000000773"
\x00CFDXSPD01"TB5427","TREASURY BILL","TI",0.00,0.00,0,"000000000000773"
\x00HFDXSPD01"UB5427","JOHN BOND","G",2994000.00,13281.26,0,"000000000000773"
\x009FDXSPD01"UNITS","UNITS","S",0.00,0.00,0,"000000000000773"
\x00\x13CHKP01  N0000000170'"""

lines = data.split('\x00')
for line in lines:
    try:
        pos = line.index('"')
        print 'ROW:', next(csv.reader([line[pos:]]))
    except ValueError as e:
        try:
            print 'OTH:', line[next(re.finditer('RPLY|CHKP', line)).start():].split()
        except StopIteration as e:
            print 'XXX:', line

<强>打印

OTH: XXX: 

OTH: ['RPLY01', 'IREQ', '0000011', 'N00']
ROW: ['CASH', '', '', '10219575.34', '0.00', '0', '000000000000773']
ROW: ['CAD', 'CANADA DOLLAR', 'CU', '-14564.52', '0.00', '0', '000000000000773']
ROW: ['CCTUSD', 'CURRENCY CONTRACT - USD', 'CC', '3644.00', '0.00', '0', '000000000000773']
...
ROW: ['TB5427', 'TREASURY BILL', 'TI', '0.00', '0.00', '0', '000000000000773']
ROW: ['UB5427', 'JOHN BOND', 'G', '2994000.00', '13281.26', '0', '000000000000773']
ROW: ['UNITS', 'UNITS', 'S', '0.00', '0.00', '0', '000000000000773']
OTH: ['CHKP01', "N0000000170'"]