我严格要求pdf文本并将其写在csv中,但将pdf的每一行写为不同的行。我正在使用pdf miner从pdf中提取文本,然后使用csv writer将其写到CSV文件中。在一个单元格中写入一个文件的内容,但在不同的单元格中写入每一行。任何帮助都感激不尽。下面是到目前为止的代码。
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import requests
from StringIO import StringIO
import csv
import pandas as pd
from os import listdir
# exctract pdf text
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
# fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(path, pagenos, maxpages=maxpages,
password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
# fp.close()
device.close()
retstr.close()
return text
# get url from csv and download it using request module
input_file = csv.DictReader(open("pdfdata.csv"))
res=[]
for row in input_file:
try:
filename=row['URL'].split("/")[-1].split('.')[0]
response=requests.get(row['URL'])
res.append([convert_pdf_to_txt(StringIO(response.content))])# call
pdf extractor
break
except Exception as e:
pass
fl=open('newpdfdata.csv','wb')
writer = csv.writer(fl)
writer.writerow(['text'])# write header
writer.writerows(res) # write result list to csv