如何获取书签的页码

时间:2011-11-30 16:52:25

标签: python pypdf

from pyPdf import PdfFileReader
# Open the PDF in binary mode; the reader below is built on this file object.
f = open('document.pdf', 'rb')
p = PdfFileReader(f)
# Outline (bookmark) entries of the document, as returned by the reader.
o = p.getOutlines()

列表对象 o 由类似字典的 pyPdf.pdf.Destination 对象(即书签)组成。这种对象有许多属性,但我找不到其中哪一个能给出书签所指向的页码。

如何得到某个书签(比如 o[1])所指向的页码?


例如,o[1].page.idnum 返回的数字大约是该书签在 PDF 文档中实际所指页码的 3 倍。我猜它引用的是某种比页面更小的对象:对整个 PDF 大纲逐项取 .page.idnum,得到的一组数字与文档中“真实”的目标页码甚至不是线性相关的,只是大致为页码的 3 倍左右。

更新:这个问题与“split a pdf based on outline”是同一个问题,不过我没看懂作者在那里的自答中做了什么,对我来说似乎太复杂了。

4 个答案:

答案 0 :(得分:9)

正如 @theta 指出的那样,“split a pdf based on outline”中已有提取页码所需的代码。如果您觉得它太复杂,我把其中将页面 ID 映射到页码的那部分代码复制出来,整理成了一个函数。下面是一个可运行的示例,打印书签 o[0] 所在的页码:

from pyPdf import PdfFileReader
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    if _result is None:
        _result = {}
    if pages is None:
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    t = pages["/Type"]
    if t == "/Pages":
        for page in pages["/Kids"]:
            _result[page.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
    elif t == "/Page":
        _num_pages.append(1)
    return _result
# main
# Do all reading inside the ``with`` block so the file is closed afterwards,
# even if one of the lookups raises.
with open('document.pdf', 'rb') as f:
    p = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(p)
    o = p.getOutlines()
    # the map is 0-based, so add 1 to get the page number of the first bookmark
    pg_num = pg_id_num_map[o[0].page.idnum] + 1
print(pg_num)
对 @theta 来说可能为时已晚,但也许能帮到其他人 :) 顺便说一下,这是我在 Stack Overflow 上的第一个回答,如果没有遵循通常的格式,还请见谅。

进一步扩展: 如果您希望在页面上获取书签的确切位置,这将使您的工作更轻松:

from pyPdf import PdfFileReader
import pyPdf
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    if _result is None:
        _result = {}
    if pages is None:
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    t = pages["/Type"]
    if t == "/Pages":
        for page in pages["/Kids"]:
            _result[page.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
    elif t == "/Page":
        _num_pages.append(1)
    return _result
def outlines_pg_zoom_info(outlines, pg_id_num_map, result=None):
    """Collect title, position and page number for every bookmark in ``outlines``.

    outlines      -- a bookmark (pyPdf.pdf.Destination) or a possibly nested
                     list of bookmarks, as returned by ``getOutlines()``.
    pg_id_num_map -- dict mapping page object ids to 0-based page indexes.
    result        -- dict to add to; a new one is created when omitted.

    Returns ``result``.  Each bookmark is stored under the first
    whitespace-separated token of its title (the whole title when it has no
    such token) as ``dict(title=..., top=..., left=..., page=...)``, where
    ``page`` is 1-based.  ``top``/``left`` are None when the destination
    carries no such coordinate.
    """
    if result is None:
        result = dict()
    if isinstance(outlines, list):
        for outline in outlines:
            result = outlines_pg_zoom_info(outline, pg_id_num_map, result)
    elif isinstance(outlines, pyPdf.pdf.Destination):
        title = outlines['/Title']
        words = title.split()
        # A blank title has no first token; fall back to the title itself.
        key = words[0] if words else title
        result[key] = dict(
            title=title,
            # Not every destination type defines both coordinates, so do not
            # insist on the keys being present.
            top=outlines.get('/Top'),
            left=outlines.get('/Left'),
            page=pg_id_num_map[outlines.page.idnum] + 1)
    return result

# main
pdf_name = 'document.pdf'
# Do all reading inside the ``with`` block so the file is closed afterwards,
# even if one of the lookups raises.
with open(pdf_name, 'rb') as f:
    pdf = PdfFileReader(f)
    # map page ids to page numbers
    pg_id_num_map = _setup_page_id_to_num(pdf)
    outlines = pdf.getOutlines()
    bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)
print(bookmarks_info)

注意:我的书签标题以章节编号开头(例如“1.1 简介”),因此我用章节编号作为键来存放书签信息。如果您的书签格式不同,请修改下面这部分代码:

    elif type(outlines) == pyPdf.pdf.Destination:
        title = outlines['/Title']
        # The dictionary key is the first whitespace-separated token of the
        # title; change ``title.split()[0]`` if your titles are laid out differently.
        result[title.split()[0]] = dict(title=outlines['/Title'], top=outlines['/Top'], \
        left=outlines['/Left'], page=(pg_id_num_map[outlines.page.idnum]+1))

答案 1 :(得分:1)

2019年,对于对更快的方式感兴趣的人,可以使用:

from PyPDF2 import PdfFileReader

def printPageNumberFrom(filename):
    """Print the 1-based page number of every bookmark in the PDF ``filename``.

    Child bookmarks are included: the outline may contain nested lists, which
    are walked recursively so that numbers come out in outline order.
    """
    def print_page_numbers(pdf, bookmarks):
        for b in bookmarks:
            if isinstance(b, list):
                # A nested list is a group of child bookmarks, not a bookmark.
                print_page_numbers(pdf, b)
            else:
                print(pdf.getDestinationPageNumber(b) + 1)  # page count starts from 0

    # Keep the file open for as long as the reader is in use.
    with open(filename, "rb") as f:
        pdf = PdfFileReader(f)
        print_page_numbers(pdf, pdf.getOutlines())

答案 2 :(得分:1)

结合 vjayky 和 Giulio D 的建议,以递归方式处理(嵌套的)书签:

PyPDF2 >= v1.25

from PyPDF2 import PdfFileReader

def printBookmarksPageNumbers(pdf):
    """Print each bookmark of ``pdf`` as "<title>: Page <n>" with 1-based pages.

    Child bookmarks are indented four more spaces than their parent level.
    """
    def walk(entries, indent=0):
        for entry in entries:
            if type(entry) is list:
                # a nested list holds the bookmarks of the next level down
                walk(entry, indent + 4)
            else:
                # getDestinationPageNumber() counts pages from 0
                page_number = 1 + pdf.getDestinationPageNumber(entry)
                print("%s%s: Page %s" % (indent * " ", entry.title, page_number))

    walk(pdf.getOutlines())

# The reader is only used inside the ``with`` block, while the file is open.
with open('document.pdf', "rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)

PyPDF2 < v1.25(旧版本,不使用 getDestinationPageNumber)

from PyPDF2 import PdfFileReader

def printBookmarksPageNumbers(pdf):
    """Print each bookmark of ``pdf`` as "<title>: Page <n>" with 1-based pages.

    Page numbers are resolved through a table built from the object id of
    every page.  Child bookmarks are indented four more spaces than their
    parent level.
    """
    # Object id of each page -> its 0-based position in the document
    page_index_by_id = {
        pdf.getPage(index).indirectRef.idnum: index
        for index in range(pdf.getNumPages())
    }

    def walk(entries, indent=0):
        for entry in entries:
            if type(entry) is list:
                # a nested list holds the bookmarks of the next level down
                walk(entry, indent + 4)
            else:
                # the table is 0-based, printed page numbers are 1-based
                page_number = 1 + page_index_by_id[entry.page.idnum]
                print("%s%s: Page %s" % (indent * " ", entry.title, page_number))

    walk(pdf.getOutlines())

# The reader is only used inside the ``with`` block, while the file is open.
with open('document.pdf', "rb") as f:
    pdf = PdfFileReader(f)
    printBookmarksPageNumbers(pdf)

答案 3 :(得分:0)

我不确定,但根据http://pybrary.net/pyPdf/pythondoc-pyPdf.pdf.html#pyPdf.pdf.Destination.page-attribute的pyPdf.Destination文档,书签的页码只是Destination.page。