Question

几个月前，我编写了一个脚本来自动化以下流程：

列出文件夹中的.pdf文件列表。
从每个pdf文件中提取数据
在excel表中保存提取的数据

当处理不超过15个pdf文件时，脚本运行正常；但如果文件更多，就无法工作。我认为是在第3步崩溃的，但我无法确定。

我加入了检查点（打印文件数量、打印提取到的数据等），但是为了能够保存含有不间断空格（non-breaking space）的数据，我需要加入这段代码：

import sys
# Python 2-only hack: re-import sys so that setdefaultencoding() can be
# called, then make Cp1252 the implicit str<->unicode codec.
# NOTE(review): reload(sys) presumably also re-initialises sys.stdout and
# sys.stderr, which would explain why nothing is shown in the Python shell
# once these lines are present -- confirm.
reload(sys)  
sys.setdefaultencoding('Cp1252')

加入这几行之后，我在Python shell中看不到任何输出，所以我无法知道脚本是在什么时候崩溃的。

我想可能是内存方面的问题，但我需要你们的帮助。

如果您可以查看我的代码并给我建议，我很高兴

谢谢，

我的完整脚本：

import sys
# Python 2-only hack: re-import sys so that setdefaultencoding() can be
# called, then make Cp1252 the implicit str<->unicode codec for the script.
# NOTE(review): reload(sys) presumably also re-initialises sys.stdout and
# sys.stderr; when run from an interactive shell this would explain why the
# print checkpoints below produce no visible output -- confirm.
reload(sys)  
sys.setdefaultencoding('Cp1252')
import os
from glob import glob

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
import xlsxwriter
import time




def find_ext(dr, ext):
    """Return the paths of the files in directory *dr* with extension *ext*.

    Args:
        dr: Directory to search (not recursive).
        ext: Extension without the leading dot, e.g. ``"pdf"``.

    Returns:
        A list of matching paths as produced by ``glob`` (each path is
        prefixed with *dr*); empty when nothing matches.
    """
    # Only ``os`` is imported by this file, so the join has to be reached
    # through ``os.path``; a bare ``path.join`` raises NameError.
    return glob(os.path.join(dr, "*.{}".format(ext)))

# Regular files in the current working directory whose name ends in
# ".pdf" or ".PDF" (other capitalisations are not picked up).
files = [
    name
    for name in os.listdir('.')
    if os.path.isfile(name) and name.endswith(('.pdf', '.PDF'))
]

def convert_pdf_to_txt(path):
    """Extract the text of every page of the PDF file at *path*.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The text that ``TextConverter`` wrote for all pages, in page order,
        using the 'utf-8' codec.

    Raises:
        Whatever ``open`` or pdfminer raise for an unreadable or
        non-extractable file; the file, the converter and the buffer are
        closed in that case as well.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set and maxpages=0 are passed exactly as before,
            # together with an empty password.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)

        # The buffer accumulates the output of every processed page, so it is
        # read once after the loop. Reading it after each page and appending
        # the result repeated all earlier pages again and made memory use
        # grow quadratically with the page count.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

# Today's date as DD-MM-YYYY, written into the first column of every row.
fecha_de_hoy = time.strftime("%d-%m-%Y")

# Create the output workbook with a single worksheet.
workbook = xlsxwriter.Workbook('Expenses.xlsx')
worksheet = workbook.add_worksheet()

# Cell coordinates are zero-based; start at the top-left cell.
row = 0
col = 0

# Column titles, in the order they appear in the sheet.
header_titles = (
    "FECHA",
    "CLIENTE",
    "PROVEEDOR",
    "REF. CLIENTE",
    "REMITENTE",
    "DESTINATARIO",
    "DIRECCION DEST.",
    "CODIGO POSTAL DEST.",
    "POBLACION DEST.",
    "PROVINCIA DEST.",
    "Nº BULTOS",
    "PESO",
    "COSTE",
    "PVP",
    "E-mail CONFIRMACIÓN",
)
for offset, title in enumerate(header_titles):
    worksheet.write(row, col + offset, title)

# Data rows start right below the header.
row += 1

def _line_at(lines, idx):
    """Return lines[idx], or '' when idx is the -1 sentinel or past the end."""
    if 0 <= idx < len(lines):
        return lines[idx]
    return ''


lengthlist = len(files)
print(lengthlist)

# The workbook is only written to disk by close(); the try/finally makes sure
# the rows gathered so far are saved even when one PDF makes the script fail,
# while the exception still propagates so the failing file is visible.
try:
    for w, factura in enumerate(files):
        print(w)
        print(factura)

        # Convert each PDF exactly once (the conversion is the costly step).
        txtList = convert_pdf_to_txt(factura).splitlines()

        destinatarioIdx = direcionNumIdx = codigoNumIdx = -1
        poblacionIdx = provinciaIdx = pedidoIdx = bultosIdx = -1

        # NOTE(review): the marker literals below look like UTF-8 text that
        # was decoded as Cp1252; they are kept as-is because they must equal
        # the extracted lines exactly -- confirm against real PDF output.
        for idx, line in enumerate(txtList):
            if line == "Destino MercancÃa":
                destinatarioIdx = idx + 1
                direcionNumIdx = idx + 2
                # Postal code and town are read from the same line: digits
                # are kept for one and stripped for the other further down.
                codigoNumIdx = idx + 3
                poblacionIdx = idx + 3
                provinciaIdx = idx + 4

            if line == "NÂº de Pedido":
                pedidoIdx = idx + 1

            if "Bultos" in line:
                bultosIdx = idx + 2

        # _line_at() yields '' instead of raising IndexError when a marker is
        # so close to the end of the text that the expected line is missing.
        nombre_destinatario = _line_at(txtList, destinatarioIdx)
        nombre_destinatario = re.sub("Ã‰", "É", nombre_destinatario)
        direccion_destinatario = _line_at(txtList, direcionNumIdx)
        codigo_destinatario = re.sub(r"\D", "", _line_at(txtList, codigoNumIdx))
        poblacion_destinatario = re.sub("[0-9]", "", _line_at(txtList, poblacionIdx))
        poblacion_destinatario = re.sub(r"\s+", "", poblacion_destinatario, flags=re.UNICODE)
        provincia_destinatario = _line_at(txtList, provinciaIdx)
        pedido_destinatario = _line_at(txtList, pedidoIdx)
        bultos_destinatario = re.sub(r"\s+", "", _line_at(txtList, bultosIdx), flags=re.UNICODE)

        # TODO: handle addresses that span two lines (idx + 2 and idx + 3);
        # in that layout postal code / town would move to idx + 4 and the
        # province to idx + 5.

        print("Nombre Destinatario")
        print(nombre_destinatario)
        print("Direccion destinatario")
        print(direccion_destinatario)
        print("codigo destinatario")
        print(codigo_destinatario)
        print("poblacion destinatario")
        print(poblacion_destinatario)
        print("Provincia destinatario")
        print(provincia_destinatario)
        print("Nº pedido destinatario")
        print(pedido_destinatario)
        print("Nº bultos envío")
        print(bultos_destinatario)

        # One spreadsheet row per PDF, in the same column order as the header.
        row_values = (
            fecha_de_hoy,
            "SIDAC",
            "PROVEEDOR",
            pedido_destinatario,
            "SIDAC",
            nombre_destinatario,
            direccion_destinatario,
            codigo_destinatario,
            poblacion_destinatario,
            provincia_destinatario,
            bultos_destinatario,
            "PESO",
            "COSTE",
            "PVP",
            "trafico@buendialogistica.com",
        )
        for offset, value in enumerate(row_values):
            worksheet.write(row, col + offset, value)

        row += 1
finally:
    workbook.close()

从PDF中提取数据并导出到Excel

0 个答案: