如何使用PyPDF2每n页拆分PDF?

时间:2017-05-11 19:21:57

标签: pdf pypdf2

我试图学习如何每n页分割一次pdf。

在我的情况下,我想将64p PDF分成几个块,每个块包含四个页面:文件1:第1-4页,文件2:第5-8页等。

我试图了解PyPDF2,但作为新手,我完全被难住了:

from PyPDF2 import PdfFileWriter, PdfFileReader
# Create a reader for the source PDF; 'my_pdf.pdf' is the input file path.
pdf = PdfFileReader('my_pdf.pdf')

我想我需要在某种循环中使用addPage并写入文件,直到没有剩余的页面为止?

2 个答案:

答案 0 :(得分:0)

回答得有点晚,但我在想做同样的事情、四处寻找帮助时看到了你的问题。我最终写了下面的代码,它实现了你所要求的功能。请注意,它做的可能比你要求的多一些,但答案就在其中。这是一个粗略的初稿,还需要重构并重命名一些变量。

import os
from PyPDF2 import PdfFileReader, PdfFileWriter

def split_pdf(in_pdf, step=1):
    """Split a PDF into consecutive chunks of ``step`` pages each.

    The chunks are written to a folder named ``<name>_splitted`` located
    next to the input file, where ``<name>`` is the input filename without
    its extension. The folder is created if it does not exist yet.

    Each output file is named ``<name>_p<first>-<last>.pdf`` (or
    ``<name>_p<page>.pdf`` when the chunk holds a single page), using
    1-based page numbers. The last chunk holds whatever pages remain and
    is named after the pages it actually contains.

    Arguments:
        in_pdf: [str] Absolute or relative path of the input pdf, or just
                the filename if the file is in the current directory.
        step:   [int] Desired number of pages in each of the output pdfs;
                must be at least 1.
    Returns:
        None. Progress is reported with print().
    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    #TODO: Add choice for output dir
    #TODO: Add logging instead of prints
    if step < 1:
        raise ValueError(f'step must be at least 1, got {step}')

    # Only the open() call is guarded: a missing input file is reported and
    # the function returns before touching any state that does not exist yet.
    try:
        in_file = open(in_pdf, 'rb')
    except FileNotFoundError:
        print(f'Cannot find the specified file. Check your input: {in_pdf}')
        return

    # The reader pulls page data from the open file while chunks are
    # written, so the input must stay open for the whole loop.
    with in_file:
        input_pdf = PdfFileReader(in_file)
        num_pages = input_pdf.numPages
        input_dir, filename = os.path.split(in_pdf)
        filename = os.path.splitext(filename)[0]
        # os.path.join keeps the folder relative when in_pdf is a bare
        # filename (input_dir == ''), instead of resolving to the root dir.
        output_dir = os.path.join(input_dir, f'{filename}_splitted')
        # exist_ok lets the function be re-run on the same input.
        os.makedirs(output_dir, exist_ok=True)
        naming = f'{filename}_p'

        count = 0
        for start in range(0, num_pages, step):
            # Clamp so the last chunk stops at the end of the document.
            stop = min(start + step, num_pages)
            output_pdf = PdfFileWriter()
            for i in range(start, stop):
                output_pdf.addPage(input_pdf.getPage(i))
            # Page numbers in file names are 1-based and inclusive.
            if stop - start == 1:
                nums = f'{start + 1}'
            else:
                nums = f'{start + 1}-{stop}'
            out_name = f'{naming}{nums}.pdf'
            with open(os.path.join(output_dir, out_name), 'wb') as outfile:
                output_pdf.write(outfile)
            print(f'{out_name} written to {output_dir}')
            count += 1
    print(f'{count} pdf files written to {output_dir}')

希望它对你有所帮助。

答案 1 :(得分:0)

from PyPDF2 import PdfFileReader, PdfFileWriter
import os   

# Method to split the pdf at every given n pages.
def split_at_every(self, infile, step=1):
    """Split the PDF at ``infile`` into chunks of ``step`` pages each.

    Every chunk is collected in its own PdfFileWriter and handed to
    ``self.write_to_file(filename, writer)``. The filename is
    ``<name>_page_<last>.pdf``, where ``<name>`` is the input filename
    without directory or extension and ``<last>`` is the 1-based number of
    the last page contained in the chunk. The final chunk holds whatever
    pages remain.

    Arguments:
        infile: [str] Path of the input pdf.
        step:   [int] Number of pages per output pdf; must be at least 1.
    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError('step must be at least 1, got {}'.format(step))

    # Base name of the input file, without directory and extension.
    fname = os.path.splitext(os.path.basename(infile))[0]

    # Keep the input open while chunks are produced and close it afterwards.
    # NOTE(review): assumes self.write_to_file writes the chunk before it
    # returns, so no page data is needed after the file is closed -- confirm.
    with open(infile, "rb") as in_file:
        input_pdf = PdfFileReader(in_file)
        pdf_len = input_pdf.numPages

        # Chunk start indices, e.g. 0, 2, 4, 6, 8 for 10 pages and step 2.
        for start in range(0, pdf_len, step):
            # Clamp so the last chunk stops at the end of the document.
            stop = min(start + step, pdf_len)

            output = PdfFileWriter()
            for page in range(start, stop):
                output.addPage(input_pdf.getPage(page))

            # Name the file after the last page in the chunk (1-based).
            output_filename = '{}_page_{}.pdf'.format(fname, stop)

            # Write the output content to the file and save it.
            self.write_to_file(output_filename, output)