我制作脚本从pdf文件中提取一些数据,然后保存在excel文件中。
当我将pdf文件转换为文本时,我没有非ASCII字符的问题但是当我保存到excel文件时效果不好。
这是我的代码:
# -*- coding: cp1252 -*-
# -*- coding: UTF-8 -*-
# -*- coding: utf-8 -*-
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os
import xlsxwriter
import sys
reload(sys)
sys.setdefaultencoding('Cp1252')
# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('zzzzz.xlsx')
worksheet = workbook.add_worksheet()
files = [f for f in os.listdir('.') if os.path.isfile(f)]
for f in files:
z = 0
e = (len(files) - 1)
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
fstr = ''
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
str = retstr.getvalue()
fstr += str
fp.close()
device.close()
retstr.close()
return fstr
row = 0
col = 0
while z<e:
factura = files[z]
#ejemplo 1
string = convert_pdf_to_txt(factura)
lines = list(filter(bool,string.split('\n')))
custData = {}
for i in range(len(lines)):
if 'EMAIL:' in lines[i]:
custData['Name'] = lines[i+1]
elif 'FACTURA' in lines[i]:
custData['BillNumber'] = lines[i+1]
elif 'Vencimientos:' in lines[i]:
custData['price'] = lines[i+2]
elif 'Banco:' in lines[i]:
custData['paymentType'] = lines[i+1]
if custData['paymentType'] == "Vencimientos:":
custData['paymentType'] = lines[i-2]
#ejemplo 2
txtList = convert_pdf_to_txt(factura).splitlines()
nameIdx, billNumIdx, priceIdx, expirDateIdx, paymentIdx = -1, -1, -1, -1, -1
for idx, line in enumerate(txtList):
if line == "EMAIL: buendialogistica@gmail.com":
nameIdx = idx +2 # in your example it should be +2...
if line == "FACTURA":
billNumIdx = idx + 1
if "Vencimientos:" in line:
priceIdx = idx + 2
expirDateIdx = idx + 1
if "Banco:" in line:
paymentIdx = idx + 1
name = txtList[nameIdx] if nameIdx != -1 else ''
billNum = txtList[billNumIdx] if billNumIdx != -1 else ''
price = txtList[priceIdx] if priceIdx != -1 else ''
expirDate = txtList[expirDateIdx] if expirDateIdx != -1 else ''
payment = txtList[paymentIdx] if paymentIdx != -1 else ''
print expirDate
billNum = billNum.replace("Â Â ", "")
print billNum
custData['Name'] = custData['Name'].replace("Â", "")
print custData['Name']
custData['paymentType'] = custData['paymentType'].replace("Â", "")
print custData['paymentType']
print price
nombre = str(custData['Name'])
formadepago = custData['paymentType']
z+=1
columna2 = str(billNum) + ", " + nombre + ", " + formadepago
worksheet.write(row, col, expirDate)
worksheet.write(row, col + 1, columna2)
worksheet.write(row, col + 2, price)
row+=1
workbook.close()
这是我现在的错误:
FRECH ESPAÑA S.A.U. = FRECH ESPAÑA S.A.U.
MONTSERRAT CARRIÓ KÖTZ - LA FUSTERIA = MONTSERRAT CARRIÓ KÖTZ - LA FUSTERIA.
我尝试使用非ASCII字符的许多代码,但我无法做任何工作。
我的代码中唯一可以包含非ASCII字符的变量是“nombre”
感谢您的帮助