我想抓取这个波斯 pdf 文件的表格并将结果作为 Pandas 数据帧,但我收到错误“NameError: name 'PDFResourceManager' is not defined”并且没有提取任何好的内容。 请帮我找到一个真正的编码解决方案。感谢您提供经过测试的代码。
from pdfminer.converter import TextConverter
from io import StringIO
from io import open
from urllib.request import urlopen
import pdfminer as pm
urlpdf="https://www.codal.ir/Reports/DownloadFile.aspx?id=jck8NF9OtmFW6fpyefK09w%3d%3d"
response = requests.get(urlpdf, verify=False, timeout=5)
f=io.BytesIO(response.content)
def readPDF(f):
rsrcmgr=PDFResourceManager()
retstr=StringIO()
laparams=LAParams()
device=TextConverter(rsrcmgr,retstr,laparams=laparams)
process_pdf(rsrcmgr,device,pdfFile)
device.close()
content=retstr.getvalue()
retstr.close()
return content
pdfFile=urlopen(urlpdf)
outputString=readPDF(pdfFile)
proceedings=outputString.encode('utf-8') # creates a UTF-8 byte object
proceedings=str(proceedings) # creates string representation <- the source of your issue
file=open("extract.txt","w", encoding="utf-8") # encodes str to platform specific encoding.
file.write(proceedings)
file.close()