这是pdf结构的一部分:
5 0 obj
<< /Length 56 >>
stream
BT /F1 12 Tf 100 700 Td 15 TL (JavaScript example) Tj ET
endstream
endobj
6 0 obj
<<
/Type /Font
/Subtype /Type1
/Name /F1
/BaseFont /Helvetica
/Encoding /MacRomanEncoding
>>
endobj
7 0 obj
<<
/Type /Action
/S /JavaScript
我想搜索&#34; javascript&#34;如果它在那里。它的问题是,javascript可以用它的十六进制整体或部分来表示它是&#34; javascript或Jav#61Script或J#61v#61Script等等#34;
所以我怎么能知道javascript是否存在所有这些可能性????
答案 0 :(得分:0)
一次读取一个字符,然后将您找到的任何十六进制转换为字符,也转换为小写。将结果与&#34; javascript&#34;。
进行比较这是一个想法:
import string
import os
import re
def pdf_find_str(pdfname, str):
f = open(pdfname, "rb")
# read the file CHUNK_SIZE chars at a time, keeping last KEEP_SIZE chars
CHUNK_SIZE = 2*1024*1024
KEEP_SIZE = 3 * len(str) # each char might be in #ff form
hexvals = "0123456789abcdef"
ichunk = removed = 0
chunk = f.read(CHUNK_SIZE)
while len(chunk) > 0:
# Loop to find all #'s and replace them with the character they represent.
hpos = chunk.find('#')
while hpos != -1:
if len(chunk)-hpos >= 3 and chunk[hpos+1] in hexvals and chunk[hpos+2] in hexvals:
hex = int(chunk[hpos+1:hpos+3], 16) # next two characters are int value
ch = chr(hex).lower()
if ch in str: # avoid doing this if ch is not in str
chunk = chunk[:hpos] + ch + chunk[hpos+3:]
removed += 2
hpos = chunk.find('#', hpos+1)
m = re.search(str, chunk, re.I)
if m:
return ichunk * (CHUNK_SIZE-KEEP_SIZE) + m.start()
# Transfer last KEEP_SIZE characters to beginning for next round of
# testing since our string may span chunks.
next_chunk = f.read(CHUNK_SIZE - KEEP_SIZE)
if len(next_chunk) == 0: break
chunk = chunk[-KEEP_SIZE:] + next_chunk
ichunk += 1
f.close()
return -1
# On one file:
#if pdf_find_str("Consciousness Explained.pdf", "javascript") != -1:
# print 'Contains "javascript"'
# Recursively on a directory:
for root, dirs, files in os.walk("Books"):
for file in files:
if file.endswith(".pdf"):
position = pdf_find_str(root + "/" + file, "javascript")
if position != -1:
print file, "(", position, ")"
# Note: position returned by pdf_find_str does not account for removed
# characters from #ff representations (if any).
答案 1 :(得分:0)
PDF很难处理。我建议您查看类似PDFMiner的内容。