我在不同的 python 文件中编写了一些函数。我现在尝试将它们放在一个文件中,但是我不知道如何使它们正确地一个接一个地运行。
我有:
convert,它接收一个 pdf 作为输入并将其转换为 XML;
uniform_cm,它接收该 XML,更改其中一些属性值并返回修改后的 XML 文件;
due,它接收 uniform_cm 生成的 XML 文件,进行一些更改(调用另一个函数 getBBoxFirstValue)并返回修改后的 XML 文件;
newline,它接收 due 生成的 XML 文件,进行一些更改并返回修改后的 XML 文件;
main,它接收 newline 生成的 XML 文件作为输入,并输出一个数据帧(DataFrame),其中列出了我编写的某些正则表达式的匹配项数量。此函数会调用:get_xml_by_tag_names、find_regex、find_regex_fasi、clean、clean_fasi 和 search_delete_append。
如果我在不同的文件上分别运行它们,一切正常,但我现在尝试把它们合并成单个文件。在这种情况下如何使它们正常工作?这是代码:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io
import lxml.etree as etree
import re
from xml.dom import minidom
from xml.etree import ElementTree as ET
import pandas as pd
def convert(case, pdfpath, targetfilepath, pages=100):
    """Convert a PDF file to text, HTML or XML with pdfminer.

    Parameters:
        case: one of 'text', 'HTML' or 'XML' -- selects the converter.
        pdfpath: path of the input PDF file.
        targetfilepath: path where the converted output is written.
        pages: iterable of page numbers to extract; a falsy value means
            all pages.

    Returns:
        targetfilepath, so the call can be chained into the next stage.

    Raises:
        ValueError: if `case` is not a supported mode (the original code
            fell through and crashed later with a NameError on `converter`).
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    codec = 'utf-8'
    caching = True
    # Layout parameters tuned for XML extraction: the very large
    # char_margin keeps the characters of a line in one text box.
    laparams2 = LAParams(all_texts=True, detect_vertical=True,
                         line_overlap=0.5, char_margin=1000.0,
                         line_margin=0.5, word_margin=0.5,
                         boxes_flow=0.5)
    if case == 'text':
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
    elif case == 'HTML':
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec, laparams=LAParams())
    elif case == 'XML':
        output = io.BytesIO()
        converter = XMLConverter(manager, output, codec=codec, laparams=laparams2)
    else:
        raise ValueError("case must be 'text', 'HTML' or 'XML', got %r" % (case,))
    interpreter = PDFPageInterpreter(manager, converter)
    # Context managers guarantee the handles are closed even on error.
    with open(pdfpath, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        convertedPDF = output.getvalue()
    converter.close()
    output.close()
    with open(targetfilepath, 'wb') as convertedFile:
        convertedFile.write(convertedPDF)
    return targetfilepath
def uniform_cm(targetfilepath):
    """Parse the XML produced by convert() and normalise 'size' attributes.

    Several distinct font-size values are collapsed onto two canonical
    values so later comparisons can treat them as equal.

    Parameters:
        targetfilepath: path of the XML file to load.

    Returns:
        The parsed (and modified) lxml ElementTree.
    """
    # Original size value -> normalised size value.
    # NOTE(review): the magic numbers are document-specific font sizes
    # from the source PDF -- confirm against the actual document.
    size_map = {
        '10.439': '10.238',
        '10.060': '10.482',
        '10.958': '10.482',
        '8.988': '10.238',
        '6.926': '10.238',
    }
    parser = etree.XMLParser(remove_blank_text=True)
    ntree = etree.parse(targetfilepath, parser)
    # One pass instead of the original three full-tree passes (which also
    # used the deprecated getiterator()); no mapped target is itself a
    # key, so the result is identical.
    for node in ntree.iter():
        size = node.attrib.get('size')
        if size in size_map:
            node.attrib['size'] = size_map[size]
    return ntree
# Return the first coordinate of an element's "bbox" attribute as a float,
# or None when the element/attribute is missing or the value is not a number.
def getBBoxFirstValue(line):
    if line is None:
        return None
    bb = line.attrib.get('bbox')
    if bb is None:
        return None
    first, _, _ = bb.partition(",")
    try:
        return float(first)
    except ValueError:
        return None
# Group the <text> elements of each <textbox> into <new_line> wrappers.
def due(ntree):
    """Insert <new_line> grouping elements into the tree from uniform_cm().

    The <textline> wrapper tags are flattened away, then the flat run of
    <text> elements inside each <textbox> is split into <new_line> groups:
    a new group starts whenever the first bbox coordinate jumps by more
    than 20 units from the previous value (presumably a large horizontal
    gap marks a line break -- TODO confirm the threshold).

    Parameters:
        ntree: lxml ElementTree as returned by uniform_cm().

    Returns:
        The same tree object, modified in place.
    """
    # Remove all 'textline' elements (their children stay in place).
    etree.strip_tags(ntree, 'textline')
    # Search for all "textbox" elements.
    for textbox in ntree.xpath('//textbox'):
        new_line = etree.Element("new_line")
        previous_bb = None
        # From a given textbox element, iterate over all the "text" elements.
        for x in textbox.iter("text"):
            # Get the current bbox first value (may be None).
            bb = getBBoxFirstValue(x)
            # Check that current and previous values aren't empty and
            # that the gap is large enough to start a new group.
            if bb is not None and previous_bb is not None and (bb - previous_bb) > 20:
                # Insert the completed new_line before x inside x's parent.
                x.getparent().insert(x.getparent().index(x), new_line)
                # Start a fresh "new_line" element for the next group.
                new_line = etree.Element("new_line")
            # Append the current element to the open new_line group
            # (lxml append *moves* the node out of its old parent).
            new_line.append(x)
            # Keep the latest non-empty bbox first value.
            if bb is not None:
                previous_bb = bb
        # Append the last group (possibly empty) to the textbox.
        textbox.append(new_line)
    tree = ntree
    return tree
#tree.write("outputfet1c822.xml", pretty_print=True)
#print(convert('XML', 'fet1c8.pdf', 'fet1c8.xml', pages=None))
def newline(tree):
    """Merge consecutive same-size <text> elements inside each <new_line>.

    Within every <new_line> block, adjacent <text> children whose 'size'
    attributes are equal (and not None) are fused: the previous element's
    text is prepended to the current one and the previous element is
    removed from the tree.  The merge cascades left to right, so a whole
    run of equal-size elements collapses into its last element.

    Parameters:
        tree: lxml ElementTree as returned by due().

    Returns:
        bytes: pretty-printed UTF-8 serialisation of the whole tree.
    """
    root = tree.getroot()
    for new_line_block in tree.xpath('//new_line'):
        # Find all "text" elements in the new_line block.
        list_text_elts = new_line_block.findall('text')
        # Iterate pairwise over (previous, current) siblings.
        for previous_text, current_text in zip(list_text_elts[:-1], list_text_elts[1:]):
            # Get size attributes (may be None).
            prev_size = previous_text.attrib.get('size')
            curr_size = current_text.attrib.get('size')
            # If they are equal and not both None...
            if curr_size == prev_size and curr_size is not None:
                # Get previous and current text, treating None as empty.
                pt = previous_text.text if previous_text.text is not None else ""
                ct = current_text.text if current_text.text is not None else ""
                # Concatenate into the current element.
                current_text.text = pt + ct
                # Remove the previous element from the tree.
                previous_text.getparent().remove(previous_text)
    newtree = etree.tostring(root, encoding='utf-8', pretty_print=True)
    # newtree = newtree.decode("utf-8")
    return newtree
def get_xml_by_tag_names(xml_path, tag_name_1, tag_name_2):
    """Collect the XML of every `tag_name_1` element containing a `tag_name_2`.

    Parameters:
        xml_path: path (or file object) of the XML document to parse.
        tag_name_1: name of the outer elements to collect.
        tag_name_2: name of the inner element that must be present.

    Returns:
        dict mapping an outer element's index (its position among all
        tag_name_1 elements) to a one-element list with its XML string.
        Outer elements without any tag_name_2 descendant are skipped.
    """
    data = {}
    xml_tree = minidom.parse(xml_path)
    for idx, item_group_node in enumerate(xml_tree.getElementsByTagName(tag_name_1)):
        # The original looped over every inner node, rewriting the same
        # dict entry each time; one membership test is equivalent.
        if item_group_node.getElementsByTagName(tag_name_2):
            data[idx] = [item_group_node.toxml()]
    return data
def find_regex_fasi(regex, text):
    """Return the number of non-overlapping matches of *regex* in *text*."""
    return len(re.findall(regex, text))
def find_regex(regex, text, opzione2=None, opzione3=None, opzione4=None):
    """Count matches of up to four patterns over an iterable of strings.

    Parameters:
        regex: mandatory pattern (string or compiled regex).
        text: iterable of strings to scan.
        opzione2, opzione3, opzione4: optional extra patterns; None means
            the slot is unused.

    Returns:
        int: total number of matches of all given patterns in all strings.
    """
    # Collapse the original four copy-pasted findall branches into one
    # loop over the non-None patterns; the total is unchanged.
    patterns = [p for p in (regex, opzione2, opzione3, opzione4) if p is not None]
    return sum(len(re.findall(p, s)) for s in text for p in patterns)
def search_delete_append(dizionario, dizionariofasi):
    """Move entries whose value contains "7.489" from one dict to another.

    Every key of *dizionario* whose value list has at least one string
    containing the size marker "7.489" is deleted from *dizionario*; the
    last matching string is stored under the same key in *dizionariofasi*.

    Both dicts are modified in place; nothing is returned.
    """
    # BUG FIX: use a set. With the original list, a key whose value list
    # matched twice was appended twice, so the second `del` raised
    # KeyError.  (The original also declared an unused `insertvalues`.)
    deletekeys = set()
    for k in dizionario:
        for v in dizionario[k]:
            if "7.489" in v:
                deletekeys.add(k)
                dizionariofasi[k] = v
    for item in deletekeys:
        del dizionario[item]
# NOTE(review): lone "[" / "]" are kept on purpose(?); only the adjacent
# pair "[]" is removed, and "<>" can never match because "<" and ">" are
# stripped first.
def clean(dizionario, lista):
    """Extract and normalise the text of each XML snippet in *dizionario*.

    For every value (a list of XML strings) the <text> children's contents
    are joined with spaces, punctuation is stripped, and the result is
    appended to *lista*, which is also returned.
    """
    junk = ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|",
            "(", ")", "\n", "/", "\uf8ee", "−", "[]", "↔", "…")
    for value in dizionario.values():
        root = ET.fromstring(' '.join(value))
        joined = ' '.join(node.text for node in root.findall('text'))
        for token in junk:
            joined = joined.replace(token, "")
        lista.append(joined)
    return lista
def clean_fasi(dizionario, lista):
    """Like clean(), but for the "fasi" dict: also drops 2-digit numbers.

    Each value is joined without separators, the <text> contents are
    joined with spaces, punctuation is stripped, and every standalone
    two-digit number is removed.  Results are appended to *lista*, which
    is also returned.
    """
    # BUG FIX: raw string.  The original "(?<!\d)..." relied on Python
    # passing the unknown escape through, which is a SyntaxWarning on
    # modern Python (and a future error).
    pattern = re.compile(r"(?<!\d)\d{2}(?!\d)")
    for value in dizionario.values():
        myxml2 = ''.join(value)
        tree2 = ET.fromstring(myxml2)
        tmpstring2 = ' '.join(text.text for text in tree2.findall('text'))
        for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>",
                          "=", "|", "(", ")", "\n", "/", "\uf8ee", "−", "[]",
                          "↔", "…"):
            tmpstring2 = tmpstring2.replace(to_remove, "")
        # Drop standalone two-digit numbers (presumably phase numbers --
        # TODO confirm).
        tmpstring2 = pattern.sub("", tmpstring2)
        lista.append(tmpstring2)
    return lista
def main(newtree):
    """Run the regex-counting stage of the pipeline and write two CSVs.

    Parameters:
        newtree: path (or file object) of the XML to analyse, passed
            straight to minidom via get_xml_by_tag_names().

    Side effects:
        Regenerates 'fet1c8.xml' from 'fet1c8.pdf' and writes
        'csv/testonormalefet1c8.csv' and 'csv/testofasifet1c8.csv'.

    NOTE(review): the intended chain is
        convert -> uniform_cm -> due -> newline -> main,
    but here the results of convert()/uniform_cm() are discarded.  To run
    everything from this single file, do e.g.:
        path = convert('XML', 'fet1c8.pdf', 'fet1c8.xml', pages=None)
        xml_bytes = newline(due(uniform_cm(path)))
    and feed xml_bytes to minidom.parseString (or write it to a file and
    pass that path as `newtree`).
    """
    convert('XML', 'fet1c8.pdf', 'fet1c8.xml', pages=None)
    uniform_cm('fet1c8.xml')
    dict_fasi = {}
    data = get_xml_by_tag_names(newtree, 'new_line', 'text')
    # Split off the "fasi" blocks (those containing size 7.489).
    search_delete_append(data, dict_fasi)
    testoo = []
    clean(data, testoo)
    # BUG FIX: the original line was missing the closing parenthesis,
    # which made the whole file a SyntaxError.
    find_prima = re.compile(r"\]\s*prima(?!\S)")
    df2 = pd.DataFrame([find_regex(find_prima, testoo)])
    df2.rename(index={0: 'prima'}, inplace=True)
    df2.to_csv('csv/testonormalefet1c8.csv')
    #################
    testo_fasi = []
    myxml_fasi = ' '.join(dict_fasi.values())
    find_CM = re.compile(r"10\.238")
    find_regex_fasi(find_CM, myxml_fasi)  # how many CM markers are there?
    clean_fasi(dict_fasi, testo_fasi)
    print(testo_fasi)
    df = pd.DataFrame([find_regex(find_prima, testo_fasi)])
    df.rename(index={0: 'prima'}, inplace=True)
    print(df)
    print(df2)
    df.to_csv('csv/testofasifet1c8.csv')