当处理多达15个pdf文件时,脚本工作完美,但如果我尝试更多,则不起作用。我认为在第3个过程中崩溃但我无法确定。
我写了检查点(打印的文件数,提取的打印数据等等)但是为了能够保存不间断的空间数据,我需要把这段代码:
import sys
reload(sys)
sys.setdefaultencoding('Cp1252')
当我放这行时,我在python shell中看不到任何内容,所以我不知道脚本崩溃的时间。
我想也许可能是关于记忆的事情,但我需要你的帮助。
如果您可以查看我的代码并给我建议,我很高兴
谢谢,
我的所有剧本:
import sys
reload(sys)
sys.setdefaultencoding('Cp1252')
import os
from glob import glob
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
import xlsxwriter
import time
def find_ext(dr, ext):
return glob(path.join(dr,"*.{}".format(ext)))
files = [f for f in os.listdir('.') if os.path.isfile(f)]
files = filter(lambda f: f.endswith(('.pdf','.PDF')), files)
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
fstr = ''
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
str = retstr.getvalue()
fstr += str
fp.close()
device.close()
retstr.close()
return fstr
fecha_de_hoy =(time.strftime("%d/%m/%Y"))
fecha_de_hoy = re.sub("/", "-", fecha_de_hoy)
# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('Expenses.xlsx')
worksheet = workbook.add_worksheet()
# Start from the first cell. Rows and columns are zero indexed.
row = 0
col = 0
# Iterate over the data and write it out row by row.
worksheet.write(row, col, "FECHA")
worksheet.write(row, col + 1, "CLIENTE")
worksheet.write(row, col + 2, "PROVEEDOR" )
worksheet.write(row, col + 3, "REF. CLIENTE" )
worksheet.write(row, col + 4, "REMITENTE")
worksheet.write(row, col + 5, "DESTINATARIO")
worksheet.write(row, col + 6, "DIRECCION DEST.")
worksheet.write(row, col + 7, "CODIGO POSTAL DEST.")
worksheet.write(row, col + 8, "POBLACION DEST.")
worksheet.write(row, col + 9, "PROVINCIA DEST.")
worksheet.write(row, col + 10, "Nº BULTOS")
worksheet.write(row, col + 11, "PESO")
worksheet.write(row, col + 12, "COSTE")
worksheet.write(row, col + 13, "PVP")
worksheet.write(row, col + 14, "E-mail CONFIRMACIÓN")
row+=1
e = len(files)
lengthlist = e
w=0
print e
while w < lengthlist:
print w
print files[w]
factura = files[w]
string = convert_pdf_to_txt(factura)
txtList = convert_pdf_to_txt(factura).splitlines()
destinatarioIdx, direcionNumIdx, codigoNumIdx, poblacionIdx, provinciaIdx, pedidoIdx, bultosIdx = -1, -1, -1, -1, -1, -1, -1
for idx, line in enumerate(txtList):
if line == "Destino MercancÃa":
destinatarioIdx = idx +1
direcionNumIdx = idx +2
codigoNumIdx = idx +3
poblacionIdx = idx +3
provinciaIdx = idx +4
if line == "Nº de Pedido":
pedidoIdx = idx +1
if "Bultos" in line:
bultosIdx = idx + 2
nombre_destinatario = txtList[destinatarioIdx] if destinatarioIdx != -1 else ''
nombre_destinatario = re.sub("É", "É", nombre_destinatario)
direccion_destinatario = txtList[direcionNumIdx] if direcionNumIdx != -1 else ''
codigo_destinatario = txtList[codigoNumIdx] if codigoNumIdx != -1 else ''
codigo_destinatario = re.sub("\D", "", codigo_destinatario)
poblacion_destinatario = txtList[poblacionIdx] if poblacionIdx != -1 else ''
poblacion_destinatario = re.sub("[0-9]", "", poblacion_destinatario)
poblacion_destinatario = re.sub(r"\s+", "", poblacion_destinatario, flags=re.UNICODE)
provincia_destinatario = txtList[provinciaIdx] if provinciaIdx != -1 else ''
pedido_destinatario = txtList[pedidoIdx] if pedidoIdx != -1 else ''
bultos_destinatario = txtList[bultosIdx] if bultosIdx != -1 else ''
bultos_destinatario = re.sub(r"\s+", "", bultos_destinatario, flags=re.UNICODE)
#ARREGLAR EXCEPCIONES
'''for idx, line in enumerate(txtList):
if line == "Destino MercancÃa":
destinatarioIdx = idx +1
direcionNumIdx = idx +2
codigoNumIdx = idx +3
if codigoNumIdx < 1000:
direcion1 = idx +2
direccion2 = idx +3
direcionNumIdx = (direcion1, direccion2)
codigoNumIdx = idx +4
poblacionIdx = idx +4
provinciaIdx = idx +5'''
print "Nombre Destinatario"
print nombre_destinatario
print "Direccion destinatario"
print direccion_destinatario
print "codigo destinatario"
print codigo_destinatario
print "poblacion destinatario"
print poblacion_destinatario
print "Provincia destinatario"
print provincia_destinatario
print "Nº pedido destinatario"
print pedido_destinatario
print "Nº bultos envío"
print bultos_destinatario
# Iterate over the data and write it out row by row.
worksheet.write(row, col, fecha_de_hoy)
worksheet.write(row, col + 1, "SIDAC")
worksheet.write(row, col + 2, "PROVEEDOR" )
worksheet.write(row, col + 3, pedido_destinatario )
worksheet.write(row, col + 4, "SIDAC")
worksheet.write(row, col + 5, nombre_destinatario)
worksheet.write(row, col + 6, direccion_destinatario)
worksheet.write(row, col + 7, codigo_destinatario)
worksheet.write(row, col + 8, poblacion_destinatario)
worksheet.write(row, col + 9, provincia_destinatario)
worksheet.write(row, col + 10, bultos_destinatario)
worksheet.write(row, col + 11, "PESO")
worksheet.write(row, col + 12, "COSTE")
worksheet.write(row, col + 13, "PVP")
worksheet.write(row, col + 14, "trafico@buendialogistica.com")
w+=1
row+=1
workbook.close()