如果PDF包含8-10页或更多页,则无法从中提取图像

时间:2017-11-22 09:52:00

标签: python pdf python-imaging-library pypdf2 image-extraction

我正在尝试从PDF中提取图像,所用的代码取自StackOverflow。它对某些PDF文件有效,但并非对所有文件都有效。我发现了一个规律:当PDF的页数超过8-10页时,它就提取不出任何内容。我想我在这里遗漏了什么,请帮我找出原因。下面是我正在使用的代码,这里是PDF资源的链接(link to pdf resources)。

import PyPDF2
import sys
from PIL import Image
import os
import glob
from PyPDF2 import PdfFileReader
def ExtractImages(filename, startPage=2):
    """Save the images embedded in a PDF into the current working directory.

    Walks the pages of ``filename`` from ``startPage`` onward, inspects each
    page's ``/XObject`` resources and writes every entry whose ``/Subtype``
    is ``/Image`` to a file named after its XObject key with the leading
    slash removed (``/Im0`` -> ``Im0.png``). The output format follows the
    stream's ``/Filter``:

    * ``/FlateDecode`` or no filter: rebuilt with ``Image.frombytes`` and
      saved as ``.png``.
    * ``/DCTDecode`` -> ``.jpg``, ``/JPXDecode`` -> ``.jp2``,
      ``/CCITTFaxDecode`` -> ``.tiff``: the bytes returned by ``getData()``
      are written unchanged.

    Images with any other filter value are silently skipped.

    Args:
        filename: Path of the PDF file to read.
        startPage: Zero-based index of the first page to scan. Defaults to 2,
            which skips the first two pages; pass 0 to scan every page.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    print("\n---------------------------------------")
    print("This is the pdf processing",filename)

    # Filters whose stream bytes are written to disk without going through PIL.
    rawExtensions = (
        ('/DCTDecode', ".jpg"),
        ('/JPXDecode', ".jp2"),
        ('/CCITTFaxDecode', ".tiff"),
    )

    # All reader access stays inside the ``with`` so the underlying stream is
    # still open whenever the reader is used, and is closed afterwards even
    # if extraction raises.
    with open(filename, "rb") as pdfStream:
        fileObject = PyPDF2.PdfFileReader(pdfStream)
        print(fileObject)
        pages = fileObject.getNumPages()
        print("Total number of Pages is.....",pages)
        for i in range(startPage, pages):
            tempPage = fileObject.getPage(i)
            if '/XObject' not in tempPage['/Resources']:
                # Printed once per page that has no XObject resources.
                print("No image found for file.",filename)
                continue
            xObject = tempPage['/Resources']['/XObject'].getObject()
            for obj in xObject:
                image = xObject[obj]
                if image['/Subtype'] != '/Image':
                    continue
                size = (image['/Width'], image['/Height'])
                data = image.getData()
                # An image without a /ColorSpace entry falls back to "P"
                # instead of raising KeyError.
                if '/ColorSpace' in image and image['/ColorSpace'] == '/DeviceRGB':
                    mode = "RGB"
                else:
                    mode = "P"
                # NOTE: the name is derived from the XObject key only, so an
                # image with the same key on a later page (or in another PDF)
                # overwrites the file written earlier.
                baseName = obj[1:]
                imageFilter = image['/Filter'] if '/Filter' in image else None
                if imageFilter is None or imageFilter == '/FlateDecode':
                    img = Image.frombytes(mode, size, data)
                    img.save(baseName + ".png")
                    continue
                for filterName, extension in rawExtensions:
                    if imageFilter == filterName:
                        with open(baseName + extension, "wb") as out:
                            out.write(data)
                        break

# Run the extractor on every PDF found in the current working directory.
listOfFiles = glob.glob('./*.pdf')
for pdfPath in listOfFiles:
    ExtractImages(pdfPath)

1 个答案:

答案 0 :(得分:0)

在 Ubuntu 16.04(amd64)上:按以下步骤操作没有出现错误。

# Install the Poppler and Leptonica development packages.
sudo apt install libpoppler-dev libleptonica-dev

# Fetch the pdffigures sources and build them.
git clone https://github.com/allenai/pdffigures.git
cd pdffigures/
make              # The executable 'pdffigures' gets created.
./pdffigures