我的Python脚本将PDF文件转换为图像,以便在PyTesseract中使用:
def images(inputFile):
    """Rasterize a PDF and run hOCR extraction on a fixed crop of each page.

    The PDF is read at a resolution of 600 and converted to PNG. Each page
    is rotated by 90 degrees, cropped to a fixed region, serialized to a PNG
    blob and passed to ``gerarHocr``.

    Every Wand image created here (the source PDF, the converted copy, the
    cropped region and its clone) is managed by a ``with`` block so that it
    is destroyed as soon as it is no longer needed. Images that are never
    destroyed keep their ImageMagick resources alive, which can leave
    ``magick-*`` pixel-cache files behind in the temp folder.

    Args:
        inputFile: Path of the PDF file to process.
    """
    formato = 'png'
    with wi(filename=inputFile, resolution=600) as pdfFile, \
            pdfFile.convert(formato) as image:
        for img in image.sequence:
            img.rotate(90)
            # HOCR
            with img[1100:4190, 1150:3080] as cropped:  # [left:right, top:bottom]
                with wi(image=cropped) as imgPage:
                    imageBlob = imgPage.make_blob(formato)
            # The blob is an independent bytes object, so OCR can run after
            # the per-page Wand images have already been released.
            # NOTE(review): the result is not consumed in the visible code.
            horas = gerarHocr(imageBlob)
def gerarHocr(imageBlob):
    """Run Portuguese hOCR on an image blob and build one record per word.

    Args:
        imageBlob: Encoded image bytes, opened through ``io.BytesIO``.

    Returns:
        A list with one ``horaMarcada(...)`` result per ``ocrx_word`` span
        found in the hOCR markup, in document order.
    """
    pagina = Image.open(io.BytesIO(imageBlob))
    hocr = pytesseract.image_to_pdf_or_hocr(pagina, lang='por', extension='hocr')
    palavras = BeautifulSoup(hocr, features='html.parser').find_all(
        'span', {'class' : 'ocrx_word'}
    )

    def _para_hora(span):
        # Tokens 1-4 of the title attribute are forwarded, followed by the
        # first whitespace-separated token of the span text.
        # NOTE(review): Tesseract hOCR titles usually look like
        # "bbox x0 y0 x1 y1; x_wconf N", so token 4 may carry a trailing ';'
        # - confirm that horaMarcada tolerates it.
        campos = span.get('title').split()
        return horaMarcada(
            campos[1], campos[2], campos[3], campos[4],
            span.get_text().split()[0],
        )

    return [_para_hora(span) for span in palavras]
# Module-level call: processes 'foo.pdf' as soon as this file is executed.
images('foo.pdf')
执行后,我在temp文件夹中有大量Magick文件,而Python并未自动删除这些文件。
我尝试了多种解决方案来阻止Wand生成这些文件:
我将 policy.xml 中的
<!-- <policy domain="resource" name="disk" value="16EB"/> -->
更改为
<!-- <policy domain="resource" name="disk" value="1GiB"/> -->
修改后的 policy.xml 如下:
<policymap>
<!-- NOTE(review): every resource/system/coder/delegate/path policy below,
     including the edited "disk" limit, is still wrapped in an XML comment,
     so only the cache shared-secret entry at the end is an active policy. -->
<!-- <policy domain="resource" name="temporary-path" value="/tmp"/> -->
<!-- <policy domain="resource" name="memory" value="2GiB"/> -->
<!-- <policy domain="resource" name="map" value="4GiB"/> -->
<!-- <policy domain="resource" name="width" value="10MP"/> -->
<!-- <policy domain="resource" name="height" value="10MP"/> -->
<!-- <policy domain="resource" name="area" value="1GB"/> -->
<!-- <policy domain="resource" name="disk" value="1GiB"/> -->
<!-- <policy domain="resource" name="file" value="768"/> -->
<!-- <policy domain="resource" name="thread" value="4"/> -->
<!-- <policy domain="resource" name="throttle" value="0"/> -->
<!-- <policy domain="resource" name="time" value="3600"/> -->
<!-- <policy domain="system" name="precision" value="6"/> -->
<!-- <policy domain="coder" rights="none" pattern="MVG" /> -->
<!-- <policy domain="delegate" rights="none" pattern="HTTPS" /> -->
<!-- <policy domain="path" rights="none" pattern="@*" /> -->
<policy domain="cache" name="shared-secret" value="passphrase" stealth="true"/>
</policymap>
但是没有用。