我使用以下代码读取PDF文件,但没有读出任何内容。可能是什么原因?
# Question transcript (Python 2 REPL): read the first page of one PDF with PyPDF2.
>>> import os
>>> from PyPDF2 import PdfFileReader, PdfFileWriter
>>> path = "/Users/Rahul/Desktop/Dfiles/"
# NOTE(review): `dirs` is assigned here but not used in the lines that follow.
>>> dirs = os.listdir( path )
>>> directory = "/Users/Rahul/Desktop/Dfiles/106_2015_34-76357.pdf"
# The PDF is opened in binary mode and closed by hand after the extraction.
>>> f = open(directory, 'rb')
>>> reader = PdfFileReader(f)
# Text of page index 0, split on newline characters.
>>> contents = reader.getPage(0).extractText().split('\n')
>>> f.close()
# An empty string split on '\n' is a one-element list holding an empty string,
# so that is what gets printed whenever extractText() returns no text.
>>> print contents
输出是 [u''],而不是文件的内容。
答案 0 :(得分:1)
import re
import PyPDF2
# Walk the PDF page by page and print every lower-cased line that contains
# the search term. The file is opened in a `with` block so the handle is
# released even if extraction fails part-way through.
with open('E://drive-download-20171015T225604Z-001/test_case/test2/try/xyz.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    num = pdfReader.numPages
    print("Number of pages:-" + str(num))
    for i in range(num):
        pageObj = pdfReader.getPage(i)
        text = pageObj.extractText()
        text1 = text.lower()
        # Iterating a str yields single characters, so a multi-character
        # pattern such as "abc" could never match; split into lines first.
        for line in text1.splitlines():
            if re.search("abc", line):
                print(line)
我用它来逐页迭代pdf并在其中搜索关键术语并进一步处理。
答案 1 :(得分:0)
可能这可以帮助您阅读PDF。
import pyPdf
def getPDFContent(path, pages=None):
    """Return the text of a PDF as one whitespace-normalised string.

    Args:
        path: Path of the PDF file to read.
        pages: Optional upper bound on the number of leading pages to read.
            ``None`` (the default) reads every page in the document.

    Returns:
        The extracted text of the selected pages, with non-breaking spaces
        turned into ordinary spaces and every run of whitespace collapsed
        to a single space.
    """
    # `open` replaces the Python-2-only `file` builtin; the `with` block
    # closes the handle once all pages have been extracted.
    with open(path, "rb") as p:
        pdf_content = pyPdf.PdfFileReader(p)
        # Ask the reader for the real page count rather than assuming 10,
        # which raised on shorter documents and truncated longer ones.
        total = pdf_content.getNumPages()
        if pages is not None:
            total = min(pages, total)
        content = "".join(
            pdf_content.getPage(i).extractText() + "\n" for i in range(total)
        )
    content = " ".join(content.replace(u"\xa0", " ").strip().split())
    return content
答案 2 :(得分:0)
我认为您需要指定盘符(驱动器名称),您的路径中缺少它。例如 "D:/Users/Rahul/Desktop/Dfiles/106_2015_34-76357.pdf"。我试过,可以毫无问题地读取。
或者,如果您想使用未与目录关联的os模块找到文件路径,可以尝试以下操作:
from PyPDF2 import PdfFileReader
import os
def find(name, path):
    """Return the path of the first file called *name* found under *path*.

    The tree rooted at *path* is traversed with ``os.walk``; the first
    directory whose file list contains *name* wins.

    Args:
        name: Exact file name to look for (no directory part).
        path: Directory at which the search starts.

    Returns:
        The matching directory joined with *name*, or ``None`` when no
        file with that name exists anywhere under *path*.
    """
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)
    # Make the "not found" result explicit for callers.
    return None
# Locate the PDF by name below the given folder, then read its first page.
directory = find('106_2015_34-76357.pdf', 'D:/Users/Rahul/Desktop/Dfiles/')
# Fail with a clear message instead of passing a missing result to open().
if directory is None:
    raise FileNotFoundError('106_2015_34-76357.pdf was not found under D:/Users/Rahul/Desktop/Dfiles/')
# The `with` block closes the file even if text extraction raises.
with open(directory, 'rb') as f:
    reader = PdfFileReader(f)
    contents = reader.getPage(0).extractText().split('\n')
print(contents)
查找函数(find)出自Nadia Alramli在问题"Find a file in python"下的回答。
答案 3 :(得分:0)
要从目录中的多个文件夹中读取文件,可以使用以下代码- 该示例用于读取pdf文件:
import os
from tika import parser
path = "/usr/local/"  # root directory to scan
directory = os.path.join(path)
# Visit every subdirectory below `directory` and print the text of each PDF.
for r, d, f in os.walk(directory):
    for filename in f:
        # Test the extension itself: a substring check would also accept
        # names such as "report.pdf.bak". The comparison ignores case so
        # that "X.PDF" is picked up as well.
        if filename.lower().endswith(".pdf"):
            file_join = os.path.join(r, filename)  # full path of the PDF
            file_data = parser.from_file(file_join)  # parse the PDF file
            text = file_data['content']  # extracted content
            print(text)
答案 4 :(得分:0)
def getTextPDF(pdfFileName, password=''):
    """Extract the text of a PDF and split it into sentences.

    Args:
        pdfFileName: Path of the PDF file to read.
        password: Password passed to the reader's ``decrypt``; when empty,
            no decryption is attempted.

    Returns:
        The result of ``nltk.sent_tokenize`` applied to the text of all
        pages concatenated with every newline character removed.
    """
    # Function-scope imports, as in the original snippet.
    import PyPDF2
    from nltk import sent_tokenize

    # The `with` block closes the file once every page has been extracted.
    with open(pdfFileName, 'rb') as pdf_file:
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        if password:
            read_pdf.decrypt(password)
        page_texts = [
            read_pdf.getPage(i).extractText()
            for i in range(read_pdf.getNumPages())
        ]
    # Newlines are dropped, not replaced by spaces. This gives the same
    # string as joining on "\n" and then stripping every "\n".
    text = ''.join(page_texts).replace("\n", '')
    return sent_tokenize(text)
答案 5 :(得分:-2)
你好Rahul Pipalia,
如果您的Python环境中尚未安装PyPDF2,请先安装PyPDF2,然后再使用此模块。
terminal
# The PyPDF2 module is packaged as python-pypdf2; python-pypdf is the older pyPdf library.
sudo apt-get install python-pypdf2
请尝试以下代码,
# Import Library
import PyPDF2
# Name of the PDF to read; it must include the ".pdf" extension.
# PDFs are binary files, so open in 'rb' mode. The `with` block closes
# the file once the page text has been extracted.
with open('Your_Pdf_File_Name.pdf', 'rb') as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    number_of_pages = read_pdf.getNumPages()
    # Index of the page to read, counted from 0 (0 is the first page).
    # Valid values run from 0 to number_of_pages - 1.
    Page_Nuber_of_the_PDF_file = 0
    page = read_pdf.getPage(Page_Nuber_of_the_PDF_file)
    page_content = page.extractText()
# Display the content of the page. The call form of print works on both
# Python 2 and Python 3 for a single argument.
print(page_content)
从下面的链接下载PDF并试用此代码, https://www.dropbox.com/s/4qad66r2361hvmu/sample.pdf?dl=1
希望我的回答对您有帮助。如有任何疑问,请留言评论。