我正在尝试从这个 PDF 文件中提取表格数据,并保存为逗号分隔(CSV)格式。该文件太大,无法用 Adobe Pro 处理,因此我转而尝试使用 Python。PDF 文件位于 https://www.live-military-mode-s.eu/pdf/Military%20Mode-S%20codes.pdf 。下面的代码可以提取数据,但最终得到约 360,000 行。我希望只导入表格数据,不包含页眉或页脚信息。
from tika import parser
import os
import glob
from easygui import *
from time import sleep
import random
import string
from PIL import Image
# Load the image first; its dimensions drive the scaling values below.
# NOTE(review): 'dlogo.jpg' is presumably a logo shown by the GUI -- nothing
# in the visible code uses `img`, `wpercent` or `hsize` afterwards; confirm.
img = Image.open('dlogo.jpg')

# No file or folder has been selected yet.
path = None

# Target width, and the height that keeps the image's aspect ratio at that width.
basewidth = 150
wpercent = basewidth / float(img.size[0])
hsize = int(float(img.size[1]) * float(wpercent))
def converter(filename, savelocation):
    """Extract the text of a PDF into a .txt file inside *savelocation*.

    Args:
        filename: Path of the source PDF *without* its ``.pdf`` extension.
        savelocation: Directory in which the text file is written.

    Returns:
        ``filename + '.txt'`` -- the source path with the extension swapped.
        Note this is NOT the path of the file that was written; the written
        file lives in *savelocation* under the same base name.
    """
    parsed = parser.from_file(filename + '.pdf')
    # Tika reports None as the content when no text could be extracted
    # (e.g. image-only PDFs); fall back to an empty string instead of crashing.
    text = parsed["content"] or ''
    new_name = filename + '.txt'
    # Build the destination with os.path so the separator is not hard-coded.
    fname = os.path.join(savelocation, os.path.basename(new_name))
    with open(fname, 'w', encoding='utf-8', errors='ignore') as f:
        # `text` is a single string: write it in one call rather than letting
        # writelines() iterate over it character by character.
        f.write(text)
    remove_empty_lines(fname)
    return new_name
def remove_empty_lines(filename, encoding='utf-8'):
    """Rewrite *filename* in place, dropping empty and whitespace-only lines.

    Args:
        filename: Path of the text file to clean up.
        encoding: Text encoding used for both reading and writing. Defaults to
            UTF-8 so that the file round-trips without depending on the
            platform's locale encoding.

    Returns:
        None. If *filename* is not an existing file, a message is printed and
        nothing is modified.
    """
    if not os.path.isfile(filename):
        print("{} does not exist ".format(filename))
        return
    # The whole file is read up front because the same path is reopened for
    # writing (which truncates it) immediately afterwards.
    with open(filename, encoding=encoding, errors='ignore') as filehandle:
        lines = filehandle.readlines()
    with open(filename, 'w', encoding=encoding, errors='ignore') as filehandle:
        filehandle.writelines(line for line in lines if line.strip())
# Interactive driver: keep showing the main menu until the user picks "Exit".
while True:
    msg = "Please Choose a File or Folder"
    title = "PDF Converter"
    choices = ["Exit", "Choose File", "Choose Folder"]
    reply = buttonbox(msg, title=title, choices=choices)
    # Strings are compared with ==; `is` tests object identity and only
    # appeared to work because identical literals happened to be shared.
    if reply == 'Exit':
        break
    elif reply == 'Choose File':
        path = fileopenbox()
        if path is None:
            # The file dialog was cancelled: return to the main menu.
            continue
        savelocation = buttonbox("Choose a Save location", title="Saving", choices=["Save Location", "Cancel"])
        if savelocation == 'Cancel':
            continue
        savepath = diropenbox()
        if savepath is None:
            # The directory dialog was cancelled: return to the main menu.
            continue
        print(savepath)
        # converter() expects the path without its extension.
        filename = os.path.splitext(path)[0]
        name = converter(filename, savepath)
        print(name)
        msgbox("File Successfully Converted to Text!!")
    elif reply == 'Choose Folder':
        path = diropenbox()
        if path is None:
            continue
        savelocation = buttonbox("Choose a Save location", title="Save Location", choices=["Save Location", "Cancel"])
        if savelocation == 'Cancel':
            continue
        savepath = diropenbox()
        if savepath is None:
            continue
        pdf_pattern = os.path.join(path, '*.pdf')
        # NOTE(review): every PDF in the chosen folder is renamed in place to
        # "<n>_<5 random chars>.pdf" before conversion. The reason is not
        # visible here (presumably to normalise awkward file names) -- confirm
        # that this destructive step is intended.
        # glob.glob() snapshots the listing first, so files renamed inside the
        # loop cannot be picked up a second time by a lazy directory scan.
        for n, pdf_path in enumerate(glob.glob(pdf_pattern), start=1):
            randomname = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(5))
            os.rename(pdf_path, os.path.join(path, str(n) + '_' + str(randomname) + '.pdf'))
        for f in glob.glob(pdf_pattern):
            filename = os.path.splitext(f)[0]
            name = converter(filename, savepath)
        msgbox("PDFS Successfully Converted to Text!!")