如何将 lxml 树对象和 lxml 模式(XMLSchema)对象 pickle 到二进制文件并重新加载

时间:2019-03-25 22:50:47

标签: python-2.7 lxml pickle dill

我试图将 lxml 元素树对象和 lxml 模式对象 pickle(实际上使用的是 dill)到二进制文件,然后再用 dill 加载它们,这样就不必在每次运行 Python 脚本时都重新解析这些文件。XSD 模式文件是静态的,永远不会改变。但是,我遇到了 pickle(或 dill)的典型问题:重新读取这些对象时,不会调用 `__init__` 函数。我仔细阅读了 dill 的 Python 文档,也读了许多关于如何 pickle 类实例、并在加载时调用构造函数以便能够实际使用它们的编程文章。如果有人可以在这方面帮助我,我将不胜感激。

from lxml import etree as ET


        for file_type in self.xml_schemas.keys():
              if os.path.isfile(self.xml_schema_files[file_type].replace("xsd", "xso")) and os.path.isfile(self.xml_schema_files[file_type].replace("xsd", "xst")):

                 with open(self.xml_schema_files[file_type].replace("xsd", "xst"), 'rb') as tree_binary_object:
                      self.xsd_trees[file_type] = dill.load(tree_binary_object).__new__(tree_binary_object)

                 with open(self.xml_schema_files[file_type].replace("xsd", "xso"), 'rb') as schema_binary_object:
                      self.xml_schemas[file_type] = dill.load(schema_binary_object).__new__(schema_binary_object)

              else:

                 xsd_tree = ET.parse(self.xml_schema_files[file_type])

                 self.xsd_trees[file_type] = xsd_tree
                 self.xml_schemas[file_type] = ET.XMLSchema(xsd_tree)

                 with open(self.xml_schema_files[file_type].replace("xsd", "xst"), 'wb') as tree_binary_object:
                      dill.dump(self.xsd_trees[file_type], tree_binary_object)

                 with open(self.xml_schema_files[file_type].replace("xsd", "xso"), 'wb') as schema_binary_object:
                      dill.dump(self.xml_schemas[file_type], schema_binary_object)

我使用的字典 xml_schemas[file_type] 保存了指向 XSD 文件的字符串。如您所见,我将 xsd 扩展名替换为 xst 来表示 pickle 后的 XML 模式树对象,用 xso 表示 pickle 后的 XML 模式对象。我首先检查这些二进制文件是否存在。我甚至不确定是否需要 pickle XML 模式定义树。我真正想要的是把一个 XML 模式类对象 pickle 到二进制文件,然后重新加载它,并像使用任何 lxml 模式对象一样用它来验证 XML 文件。我想我可能必须继承 lxml.etree.XMLSchema 才能使其工作。如果是这种情况,有人可以给我提供一些示例代码吗?任何帮助我都将非常感激。谢谢!

更新

好的,我已经设法把解析后的 XML Schema 定义树对象成功 pickle 了。这可能足以加快 XML 验证脚本中数据的加载速度。针对要求提供更多代码的评论,下面基本上是我目前在我的类中所做的事情:

import os
import dill
import stat
import pprint
import sys
import string
import re
import copy
from lxml import etree as ET

from utilities import line_parser

class xml_processing_utilities:

      def __init__(self):
          """Set up the per-file-type schema tables and load the schema trees.

          Three dictionaries keyed by the file-type codes "CA", "TC", "SN",
          "NM" and "NC" are created:

          * ``xml_schema_files`` -- full path of the XSD file for each type
            (``xml_schemas_location`` + file name).
          * ``xml_schemas`` -- initialised to ``None`` for every type.
          * ``xsd_trees`` -- initialised to ``None`` for every type.

          ``get_xml_schemas()`` is then called to fill in the trees.
          """
          self.xml_schemas_location = "/data/scratch/bbarrett/scripts/python/test_case_validation/develop/maestro_files/xml_schemas/"

          # (file-type code, XSD file name) pairs; one row per supported type.
          schema_file_names = (
              ("CA", "maestro_test_case.xsd"),
              ("TC", "maestro_test_config.xsd"),
              ("SN", "maestro_scenario.xsd"),
              ("NM", "maestro_node_mapping.xsd"),
              ("NC", "maestro_node_config.xsd"),
          )

          self.xml_schema_files = {}
          for type_code, file_name in schema_file_names:
              self.xml_schema_files[type_code] = self.xml_schemas_location + file_name

          # Both lookup tables start out with every type mapped to None.
          type_codes = [pair[0] for pair in schema_file_names]
          self.xml_schemas = dict.fromkeys(type_codes)
          self.xsd_trees = dict.fromkeys(type_codes)

          self.get_xml_schemas()

      def initialize_xml_data(self, xml_file, file_type):
          """Syntax-check and schema-validate one XML file.

          Steps, in order:

          1. Read the file's lines via ``self.read_xml_file_lines`` if
             ``os.path.isfile`` and ``self.file_is_readable`` both pass.
          2. Check the file type with ``self.validate_maestro_file_type``.
          3. Parse the lines with ``line_parser.LineNumberingParser(recover=True)``
             and report any collected parser errors through
             ``self.lxml_error_handler``.
          4. Build an ``ET.XMLSchema`` from ``self.xsd_trees[file_type]`` and
             call ``assertValid`` on the parsed tree, reporting the error log
             through ``self.lxml_error_handler`` on failure.

          xml_file  -- path of the XML file to check.
          file_type -- key into ``self.xsd_trees`` selecting the schema tree.

          Returns ``False`` as soon as any step fails.  When every step passes
          there is no explicit ``return``, so the result is ``None``.
          """

          if os.path.isfile(xml_file) and self.file_is_readable(xml_file, -1, ""):
             xml_file_lines = self.read_xml_file_lines(xml_file)
          else:
             return False

          if self.validate_maestro_file_type(file_type, xml_file) != True:
             # NOTE(review): the next line is cut off at "$" in this listing
             # (the string literal and the call are never closed); the full
             # message text is not visible here.
             self.PRINT_COLOR.red("Error: " + xml_file + " is not a valid test case " + file_type + " file. It is missing an opening MAESTRO or SimData XML element at the $
             return False

          passed_syntax_check = True
          # Reset the module-level error list before parsing this file.
          line_parser.parser_error_log = []

          parser = line_parser.LineNumberingParser(recover=True)
          parser.feed_lines_last_index = len(xml_file_lines) - 1

          # The exception is deliberately ignored here; syntax problems are
          # detected below from the two error logs instead.
          # NOTE(review): if fromstringlist raises, xml_tree is never bound --
          # confirm the error logs are always non-empty in that case.
          try:
            xml_tree = ET.fromstringlist(xml_file_lines, parser)
          except ET.XMLSyntaxError as lxml_exception:
            pass

          if len(parser.feed_error_log) > 0 or len(line_parser.parser_error_log) > 0:
             passed_syntax_check = False
             self.lxml_error_handler(line_parser.parser_error_log, parser.feed_error_log, file_type, xml_file_lines)

          if not passed_syntax_check:
             return False

          if passed_syntax_check:
             passed_schema_check = True

             schema_validation_errors = None

#############xsd_tree = ET.parse(self.xml_schema_files[file_type])
#############self.xml_schemas[file_type] = self.xsd_trees[file_type]
             # The schema object is rebuilt from the stored tree on every call.
             xml_schema = ET.XMLSchema(self.xsd_trees[file_type])

             # NOTE(review): binding the exception to schema_validation_errors
             # and reading it after the except block relies on Python 2
             # scoping; Python 3 deletes the name when the block ends.
             try:
               xml_schema.assertValid(xml_tree)
             except ET.DocumentInvalid as schema_validation_errors:
               pass

             if schema_validation_errors != None and hasattr(schema_validation_errors, 'error_log'):
                self.lxml_error_handler(None, schema_validation_errors.error_log, file_type, xml_file_lines)
                passed_schema_check = False

             if passed_schema_check != True:
                return False


      def get_xml_schemas(self):
          """Fill ``self.xsd_trees`` with one parsed XSD tree per file type.

          For every ``file_type -> xsd_path`` entry in ``self.xml_schema_files``
          a dill cache file with the same name but an ``.xst`` extension is
          used:

          * if the ``.xst`` file exists, the tree is loaded from it with
            ``dill.load``;
          * otherwise the XSD is parsed with ``ET.parse`` and the resulting
            tree is written to the ``.xst`` file with ``dill.dump``.

          ``self.xml_schemas`` is not touched here; no ``.xso`` schema-object
          cache is read or written.

          Changes from the previous version:

          * The cache is used when the ``.xst`` file exists.  Previously an
            ``.xso`` file was also required, but nothing writes that file, so
            the cached tree was never loaded and every run re-parsed the XSD.
          * Only the file extension is swapped (``os.path.splitext``).
            ``str.replace("xsd", ...)`` rewrote every "xsd" in the path and,
            for a path without "xsd", would have targeted the XSD file itself.
          """
          for file_type, xsd_path in self.xml_schema_files.items():

              tree_cache_path = os.path.splitext(xsd_path)[0] + ".xst"

              if os.path.isfile(tree_cache_path):
                  # NOTE(review): dill.load runs pickle opcodes from the file;
                  # only safe while the cache directory is trusted.
                  with open(tree_cache_path, 'rb') as tree_binary_object:
                      self.xsd_trees[file_type] = dill.load(tree_binary_object)
              else:
                  self.xsd_trees[file_type] = ET.parse(xsd_path)

                  with open(tree_cache_path, 'wb') as tree_binary_object:
                      dill.dump(self.xsd_trees[file_type], tree_binary_object)

不用在意 line_parser。它只是 XMLParser 类的一个子类,对由 XML 文件各行组成的字符串列表使用 feed 方法来获取行号,以便告诉用户 XML 语法错误出现在哪一行。

如您所见,pickle(使用 dill)lxml 模式对象的尝试已被注释掉。当我在脚本中保留这些代码行并运行时,Python 解释器会崩溃,并抛出如下错误消息:

  File "validate_test_case.py", line 174, in <module>
    if ca_file.endswith(".xml") and UTILS.initialize_xml_data(ca_file, "CA") != False:
  File "/data/scratch/bbarrett/scripts/python/test_case_validation/develop/utilities/xml_utilities.py", line 132, in initialize_xml_data
    self.xml_schemas[file_type].assertValid(xml_tree)
  File "src/lxml/etree.pyx", line 3525, in lxml.etree._Validator.assertValid
  File "src/lxml/xmlschema.pxi", line 111, in lxml.etree.XMLSchema.__call__

因此,我很想了解如何让这个 lxml 模式对象也能被 pickle/unpickle。再次感谢所有提出建议的人!

更新

在我看来,把 XML 模式树对象 pickle 之后再用 pickle 加载回来,可能并没有太大的速度优势。我发现脚本并不比每次运行时都从模式定义文件重新构建树快多少。如果这种方法对加载 XML 数据的速度帮助不大,有人能就如何提速提出一些建议吗?我的意思是,XML 模式文件不会更改。也许有某种方法可以把已构建的 XML 树存储到磁盘上的数据文件中,之后再把它快速加载回 lxml etree 对象?任何建议我都将不胜感激。

更新

我刚刚在代码中发现了一个逻辑错误。代码总是会检查 .xso(XML 模式对象)文件是否存在——这是此前尝试 pickle lxml 模式定义对象时遗留下来的——因此它每次都会重新构建树。因此,代码如下:


      def initialize_xml_data(self, xml_file, file_type):
          """Syntax-check and schema-validate one XML file.

          Steps, in order:

          1. Read the file's lines via ``self.read_xml_file_lines`` if
             ``os.path.isfile`` and ``self.file_is_readable`` both pass.
          2. Check the file type with ``self.validate_maestro_file_type``.
          3. Parse the lines with ``line_parser.LineNumberingParser(recover=True)``
             and report any collected parser errors through
             ``self.lxml_error_handler``.
          4. Build an ``ET.XMLSchema`` from ``self.xsd_trees[file_type]`` and
             call ``assertValid`` on the parsed tree, reporting the error log
             through ``self.lxml_error_handler`` on failure.

          xml_file  -- path of the XML file to check.
          file_type -- key into ``self.xsd_trees`` selecting the schema tree.

          Returns ``False`` as soon as any step fails.  When every step passes
          there is no explicit ``return``, so the result is ``None``.
          """

          if os.path.isfile(xml_file) and self.file_is_readable(xml_file, -1, ""):
             xml_file_lines = self.read_xml_file_lines(xml_file)
          else:
             return False

          if self.validate_maestro_file_type(file_type, xml_file) != True:
             # NOTE(review): the next line is cut off at "$" in this listing
             # (the string literal and the call are never closed); the full
             # message text is not visible here.
             self.PRINT_COLOR.red("Error: " + xml_file + " is not a valid test case " + file_type + " file. It is missing an opening MAESTRO or SimData XML element at the $
             return False

          passed_syntax_check = True
          # Reset the module-level error list before parsing this file.
          line_parser.parser_error_log = []

          parser = line_parser.LineNumberingParser(recover=True)
          parser.feed_lines_last_index = len(xml_file_lines) - 1

          # The exception is deliberately ignored here; syntax problems are
          # detected below from the two error logs instead.
          # NOTE(review): if fromstringlist raises, xml_tree is never bound --
          # confirm the error logs are always non-empty in that case.
          try:
            xml_tree = ET.fromstringlist(xml_file_lines, parser)
          except ET.XMLSyntaxError as lxml_exception:
            pass

          if len(parser.feed_error_log) > 0 or len(line_parser.parser_error_log) > 0:
             passed_syntax_check = False
             self.lxml_error_handler(line_parser.parser_error_log, parser.feed_error_log, file_type, xml_file_lines)

          if not passed_syntax_check:
             return False

          if passed_syntax_check:
             passed_schema_check = True

             schema_validation_errors = None

#############xsd_tree = ET.parse(self.xml_schema_files[file_type])
#############self.xml_schemas[file_type] = self.xsd_trees[file_type]
             # The schema object is rebuilt from the stored tree on every call.
             xml_schema = ET.XMLSchema(self.xsd_trees[file_type])

             # NOTE(review): binding the exception to schema_validation_errors
             # and reading it after the except block relies on Python 2
             # scoping; Python 3 deletes the name when the block ends.
             try:
               xml_schema.assertValid(xml_tree)
             except ET.DocumentInvalid as schema_validation_errors:
               pass

             if schema_validation_errors != None and hasattr(schema_validation_errors, 'error_log'):
                self.lxml_error_handler(None, schema_validation_errors.error_log, file_type, xml_file_lines)
                passed_schema_check = False

             if passed_schema_check != True:
                return False


      def get_xml_schema_trees(self):
          """Parse every configured XSD file into ``self.xsd_trees``.

          Each ``file_type -> path`` entry of ``self.xml_schema_files`` is
          parsed with ``ET.parse`` and the resulting tree is stored under the
          same key.  No cache files are read or written.
          """
          for type_code, schema_path in self.xml_schema_files.items():
              self.xsd_trees[type_code] = ET.parse(schema_path)


因此,基本上,我现在根本不再尝试 pickle,尝试 pickle 的那个函数也可以删掉了。如果有人可以帮我 pickle 这些 lxml 对象,我将非常感激。

1 个答案:

答案 0 :(得分:0)

不妨试试 xmlschema 库。看起来使用该库加载的模式(schema)可以毫无问题地被 pickle。