从PDF中提取数据并导出到Excel

时间:2017-05-14 07:32:12

标签: python excel

几个月后我制作了自动化下一个流程的脚本。

  1. 列出文件夹中的.pdf文件列表。
  2. 从每个pdf文件中提取数据
  3. 在excel表中保存提取的数据
  4. 当处理多达15个pdf文件时,脚本工作完美,但如果我尝试更多,则不起作用。我认为在第3个过程中崩溃但我无法确定。

    我写了检查点(打印的文件数,提取的打印数据等等)但是为了能够保存不间断的空间数据,我需要把这段代码:

    import sys
    reload(sys)  
    sys.setdefaultencoding('Cp1252')
    

    当我放这行时,我在python shell中看不到任何内容,所以我不知道脚本崩溃的时间。

    我想也许可能是关于记忆的事情,但我需要你的帮助。

    如果您可以查看我的代码并给我建议,我很高兴

    谢谢,

    我的所有剧本:

    import sys
    reload(sys)  
    sys.setdefaultencoding('Cp1252')
    import os
    from glob import glob
    
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO
    import re
    import xlsxwriter
    import time
    
    
    
    
    def find_ext(dr, ext):
        return glob(path.join(dr,"*.{}".format(ext)))  
    
    files = [f for f in os.listdir('.') if os.path.isfile(f)]
    files = filter(lambda f: f.endswith(('.pdf','.PDF')), files)
    
    def convert_pdf_to_txt(path):
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = file(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos=set()
        fstr = ''
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,    password=password,caching=caching, check_extractable=True):
            interpreter.process_page(page)
    
            str = retstr.getvalue()
            fstr += str
    
        fp.close()
        device.close()
        retstr.close()
        return fstr
    
    fecha_de_hoy =(time.strftime("%d/%m/%Y"))
    fecha_de_hoy = re.sub("/", "-", fecha_de_hoy)
    
    # Create a workbook and add a worksheet.
    workbook = xlsxwriter.Workbook('Expenses.xlsx')
    worksheet = workbook.add_worksheet()
    
    
    
    # Start from the first cell. Rows and columns are zero indexed.
    row = 0
    col = 0
    
    
    # Iterate over the data and write it out row by row.
    
    worksheet.write(row, col, "FECHA")
    worksheet.write(row, col + 1, "CLIENTE")
    worksheet.write(row, col + 2, "PROVEEDOR" )
    worksheet.write(row, col + 3, "REF. CLIENTE" )
    worksheet.write(row, col + 4, "REMITENTE")
    worksheet.write(row, col + 5, "DESTINATARIO")
    worksheet.write(row, col + 6, "DIRECCION DEST.")
    worksheet.write(row, col + 7, "CODIGO POSTAL DEST.")
    worksheet.write(row, col + 8, "POBLACION DEST.")
    worksheet.write(row, col + 9, "PROVINCIA DEST.")
    worksheet.write(row, col + 10, "Nº BULTOS")
    worksheet.write(row, col + 11, "PESO")
    worksheet.write(row, col + 12, "COSTE")
    worksheet.write(row, col + 13, "PVP")
    worksheet.write(row, col + 14, "E-mail CONFIRMACIÓN")
    
    row+=1
    
    e = len(files)
    lengthlist = e
    w=0
    print e
    
    while w < lengthlist:
    
        print w
        print files[w]
    
        factura = files[w]
    
        string = convert_pdf_to_txt(factura)
    
    
        txtList = convert_pdf_to_txt(factura).splitlines()
        destinatarioIdx, direcionNumIdx, codigoNumIdx, poblacionIdx, provinciaIdx, pedidoIdx, bultosIdx = -1, -1, -1, -1, -1, -1, -1
    
        for idx, line in enumerate(txtList):
            if line == "Destino Mercancía":
                destinatarioIdx = idx +1
                direcionNumIdx = idx +2
                codigoNumIdx = idx +3  
                poblacionIdx = idx +3
                provinciaIdx = idx +4
    
    
            if line == "Nº de Pedido":
                pedidoIdx = idx +1
    
            if "Bultos" in line:
                bultosIdx = idx + 2
    
    
    
        nombre_destinatario = txtList[destinatarioIdx] if destinatarioIdx != -1 else ''
        nombre_destinatario = re.sub("É", "É", nombre_destinatario)
        direccion_destinatario = txtList[direcionNumIdx] if direcionNumIdx != -1 else ''
        codigo_destinatario = txtList[codigoNumIdx] if codigoNumIdx != -1 else ''
        codigo_destinatario = re.sub("\D", "", codigo_destinatario)
        poblacion_destinatario = txtList[poblacionIdx] if poblacionIdx != -1 else ''
        poblacion_destinatario = re.sub("[0-9]", "", poblacion_destinatario)
        poblacion_destinatario = re.sub(r"\s+", "", poblacion_destinatario, flags=re.UNICODE)
        provincia_destinatario = txtList[provinciaIdx] if provinciaIdx != -1 else ''
        pedido_destinatario = txtList[pedidoIdx] if pedidoIdx != -1 else ''
        bultos_destinatario = txtList[bultosIdx] if bultosIdx != -1 else ''
        bultos_destinatario = re.sub(r"\s+", "", bultos_destinatario, flags=re.UNICODE)
    
        #ARREGLAR EXCEPCIONES
    
        '''for idx, line in enumerate(txtList):
            if line == "Destino Mercancía":
                destinatarioIdx = idx +1
                direcionNumIdx = idx +2
                codigoNumIdx = idx +3
    
                if codigoNumIdx < 1000:
                    direcion1 = idx +2
                    direccion2 = idx +3
                    direcionNumIdx = (direcion1, direccion2)
                    codigoNumIdx = idx +4
                    poblacionIdx = idx +4
                    provinciaIdx = idx +5'''
    
        print "Nombre Destinatario"
        print nombre_destinatario
        print "Direccion destinatario"
        print direccion_destinatario
        print "codigo destinatario"
        print codigo_destinatario
        print "poblacion destinatario"
        print poblacion_destinatario
        print "Provincia destinatario"
        print provincia_destinatario
        print "Nº pedido destinatario"
        print pedido_destinatario
        print "Nº bultos envío"
        print bultos_destinatario
    
        # Iterate over the data and write it out row by row.
    
        worksheet.write(row, col, fecha_de_hoy)
        worksheet.write(row, col + 1, "SIDAC")
        worksheet.write(row, col + 2, "PROVEEDOR" )
        worksheet.write(row, col + 3, pedido_destinatario )
        worksheet.write(row, col + 4, "SIDAC")
        worksheet.write(row, col + 5, nombre_destinatario)
        worksheet.write(row, col + 6, direccion_destinatario)
        worksheet.write(row, col + 7, codigo_destinatario)
        worksheet.write(row, col + 8, poblacion_destinatario)
        worksheet.write(row, col + 9, provincia_destinatario)
        worksheet.write(row, col + 10, bultos_destinatario)
        worksheet.write(row, col + 11, "PESO")
        worksheet.write(row, col + 12, "COSTE")
        worksheet.write(row, col + 13, "PVP")
        worksheet.write(row, col + 14, "trafico@buendialogistica.com")
    
    
    
    
        w+=1
        row+=1
    
    
    workbook.close()
    

0 个答案:

没有答案