我正在尝试从 PDF 中提取图像,所用的代码来自 StackOverflow。
它适用于某些 PDF 文件,但并非适用于所有文件。我注意到一个规律:当 PDF 的页数超过 8–10 页时,它提取不出任何内容。我想我在这里遗漏了什么,请帮我找出原因。下面是我正在使用的代码,以及 PDF 资源的链接(link to pdf resources)。
import PyPDF2
import sys
from PIL import Image
import os
import glob
from PyPDF2 import PdfFileReader
def ExtractImages(filename, first_page=2):
    """Extract the images embedded in a PDF into the current directory.

    Every page from ``first_page`` up to the last page is inspected. For each
    entry of the page's ``/XObject`` resources whose ``/Subtype`` is
    ``/Image``, one file is written, named after the XObject key without its
    leading ``/``:

    * ``/FlateDecode`` images and images without a ``/Filter`` entry are
      rebuilt with ``Image.frombytes`` and saved as ``.png``;
    * ``/DCTDecode``, ``/JPXDecode`` and ``/CCITTFaxDecode`` images have the
      bytes returned by ``getData()`` written as-is to ``.jpg``, ``.jp2`` and
      ``.tiff`` respectively;
    * any other ``/Filter`` value is reported on stdout and skipped.

    Because the file name is derived from the XObject key alone, images that
    share a key (on different pages or in different PDFs) overwrite each
    other.

    Args:
        filename: Path of the PDF file to read.
        first_page: Zero-based index of the first page to scan. The default
            of 2 keeps the historical behaviour of skipping the first two
            pages; pass 0 to scan the whole document.
    """
    # Filters whose stream bytes are dumped to disk unchanged, mapped to the
    # extension of the file that receives them.
    raw_extensions = {
        '/DCTDecode': ".jpg",
        '/JPXDecode': ".jp2",
        '/CCITTFaxDecode': ".tiff",
    }

    print("\n---------------------------------------")
    print("This is the pdf processing", filename)

    # The file stays open for the whole scan because pages and image streams
    # are fetched through the reader inside this block; the context manager
    # guarantees the handle is closed afterwards, even on error.
    with open(filename, "rb") as pdf_stream:
        fileObject = PyPDF2.PdfFileReader(pdf_stream)
        print(fileObject)
        pages = fileObject.getNumPages()
        print("Total number of Pages is.....", pages)

        for i in range(first_page, pages):
            tempPage = fileObject.getPage(i)
            if '/XObject' not in tempPage['/Resources']:
                print("No image found for file.", filename)
                continue

            xObject = tempPage['/Resources']['/XObject'].getObject()
            for obj in xObject:
                image = xObject[obj]
                if image['/Subtype'] != '/Image':
                    continue

                size = (image['/Width'], image['/Height'])
                data = image.getData()

                # '/ColorSpace' can be absent from an image dictionary, so
                # test for the key instead of indexing blindly. Anything that
                # is not plain DeviceRGB keeps the original fallback mode.
                if '/ColorSpace' in image and image['/ColorSpace'] == '/DeviceRGB':
                    mode = "RGB"
                else:
                    mode = "P"

                base_name = obj[1:]  # drop the leading '/' of the PDF name

                if '/Filter' not in image:
                    Image.frombytes(mode, size, data).save(base_name + ".png")
                    continue

                image_filter = image['/Filter']
                if image_filter == '/FlateDecode':
                    Image.frombytes(mode, size, data).save(base_name + ".png")
                elif isinstance(image_filter, str) and image_filter in raw_extensions:
                    # isinstance guard: a filter given as an array is not
                    # hashable and must not reach the dict lookup.
                    with open(base_name + raw_extensions[image_filter], "wb") as img:
                        img.write(data)
                else:
                    # Previously such images were dropped without a trace.
                    print("Skipping image", obj, "with unsupported filter", image_filter)
# Run the extractor on every PDF found directly in the current directory.
for pdf_path in glob.glob('./*.pdf'):
    ExtractImages(pdf_path)
答案 0 :(得分:0)
Ubuntu 16.04 - amd64:这里没有错误。
# Install the poppler and leptonica development packages.
sudo apt install libpoppler-dev libleptonica-dev
# Fetch the pdffigures sources and build them.
git clone https://github.com/allenai/pdffigures.git
cd pdffigures/
make  # The executable 'pdffigures' gets created.
./pdffigures