我正在尝试使用正则表达式抓取一些PDF。我在下面有以下代码:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
import re
def pdf_to_text(path):
manager = PDFResourceManager()
retstr = BytesIO()
layout = LAParams(all_texts=True)
device = TextConverter(manager, retstr, laparams=layout)
filepath = open(path, 'rb')
interpreter = PDFPageInterpreter(manager, device)
for page in PDFPage.get_pages(filepath, check_extractable=False):
interpreter.process_page(page)
text = retstr.getvalue()
filepath.close()
device.close()
retstr.close()
return text
if __name__ == "__main__":
text = pdf_to_text("C:\\Users\\User1\\Documents\\Python Scripts\\PDF Scraping\\123.pdf")
regex = r"(?<=3\. Induction Training\n).*(?=4\. Corporate Training\/Departmental Training)"
matches = re.finditer(regex, text, re.MULTILINE | re.DOTALL)
print(text)
我遇到以下错误:
File "C:\Users\User1\AppData\Local\Continuum\anaconda3\lib\re.py", line 230, in finditer
return _compile(pattern, flags).finditer(string)
TypeError: cannot use a string pattern on a bytes-like object
我不确定我的代码底部是否正确,有人可以指出我要去哪里了吗?