Question

给出这个xml文件，我想从中提取数据。但是，我无法从<LandmarkPointListXml>开始提取数据。

XML文件：

  <?xml version="1.0" encoding="utf-8"?>
  <Map xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
  <MapName>er</MapName>
  <MapURL>er.gif</MapURL>
  <Name>er</Name>
  <URL>er.gif</URL>
  <LandmarkPointListXml>
    <anyType xsi:type="LandmarkPointProperty">
      <LandmarkPointX>400</LandmarkPointX>
      <LandmarkPointY>292</LandmarkPointY>
      <LandmarkDesc>my room door</LandmarkDesc>
    </anyType>
    <anyType xsi:type="LandmarkPointProperty">
      <LandmarkPointX>399</LandmarkPointX>
      <LandmarkPointY>219</LandmarkPointY>
      <LandmarkDesc>bro room door</LandmarkDesc>
    </anyType>
  </LandmarkPointListXml>
  <RegionPointListXml />
</Map>

Python程序：

    def GetMapData(self):
        """Build one string from the children of the document's first node.

        Children named 'LandmarkPointListXml' are delegated to loopLandmark
        and wrapped in '|'; for every other child the value of its first
        child node is appended, followed by ','.
        """
        result = ""
        haha = self.XMLdoc.firstChild #root node
        for child in haha.childNodes:
            if (cmp(child.nodeName,'LandmarkPointListXml')==0):
                # NOTE(review): this passes child.childNodes (the child list),
                # while loopLandmark calls getElementsByTagName on its
                # argument -- presumably the element itself was intended.
                result = result + '|' + self.loopLandmark(child.childNodes) + '|'
            else:
                # NOTE(review): assumes every other child has a first child;
                # whitespace text nodes and empty elements likely do not --
                # confirm against the input file.
                result = result + child.firstChild.nodeValue + ','
        return result

    def loopLandmark(self, landmarks):
        """Try to collect the values of the 'anyType' entries under landmarks.

        NOTE(review): as written this does not look runnable -- see the notes
        on the individual lines below.
        """
        result=""
        haha=landmarks.getElementsByTagName('anyType')
        # NOTE(review): haha is the result of getElementsByTagName (presumably
        # a list of nodes); reading .childNodes / .firstChild on it looks
        # unintended -- iterating over haha itself was probably meant.
        for child in haha.childNodes:
            if (cmp(haha.firstChild.nodeName,'LandmarkPointX') == 0):
                result=result+child.firstChild.nodeValue+','
                # ChildNode is read here before anything in this function
                # assigns it, and `child` itself is never advanced, so the
                # three appends all read the same child.firstChild.
                ChildNode = ChildNode.nextSibling
                result=result+child.firstChild.nodeValue+','
                ChildNode = ChildNode.nextSibling
                result=result+child.firstChild.nodeValue
        return result

在程序到达<LandmarkPointListXml>之前，我能够取得结果“er,er.gif,er,er.gif,”等。

Answer 1

此代码非常脆弱。它对XML输入做了很强的假设，一旦XML以其他合法的方式发生变化（例如某个元素不再紧跟在另一个元素之后），它就会失败。

我建议在解析XML时使用标准库，例如元素树（http://docs.python.org/library/xml.etree.elementtree.html）或lxml（http://lxml.de），它们也可以验证您的XML输入。

我在下面编写的代码使用了Element Tree并处理了你的XML输入（我删除了父类的'self'参数）。它还容忍（忽略）XML元素中的空值。

import xml.etree.ElementTree as ET

def GetMapData( xmlfile ):
    """Flatten the top-level elements of a map XML file into one string.

    Every direct child of the root that has text contributes ``text + ','``.
    The ``LandmarkPointListXml`` child contributes the output of
    ``loopLandmark`` wrapped in ``'|'``. Children without text are skipped.

    Args:
        xmlfile: path (or file object) handed to ``ET.parse``.

    Returns:
        The flattened string, or ``""`` when the file cannot be read (a
        message is printed in that case).
    """
    try:
        tree = ET.parse( xmlfile )
    except IOError as e:
        # Return early: without a parsed tree there is nothing to walk, and
        # falling through would fail on the unbound name `tree`.
        print("Failure Parsing %s: %s" % (xmlfile, e))
        return ""

    parts = []
    for child in tree.getroot():
        if child.tag == 'LandmarkPointListXml':
            parts.append('|' + loopLandmark(child) + '|')
        elif child.text is not None:
            parts.append(child.text + ',')
    return "".join(parts)

def loopLandmark( landmarks ):
    """Collect the X/Y coordinates of every 'anyType' entry under landmarks.

    Each non-empty ``LandmarkPointX`` / ``LandmarkPointY`` text found inside
    an ``anyType`` child is emitted followed by ``','``, in document order.
    Other children and other fields are ignored.
    """
    wanted = ( 'LandmarkPointX', 'LandmarkPointY' )
    pieces = []
    for entry in landmarks:
        if entry.tag != 'anyType': # check also xsi:type="LandmarkPointProperty"?
            continue
        for field in entry:
            if field.tag in wanted and field.text:
                pieces.append( field.text + ',' )
    return "".join( pieces )

# Run the extraction on the sample file 'xml.in'. The returned string is
# neither stored nor printed here.
GetMapData( 'xml.in' )

Answer 2

我设法从发布的XML文件中提取出了数据，但我觉得应该有比我这个答案更简单的做法：要取得每一项数据，都需要进行大量循环。

import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString

class mapDataClass:
    """Hold a minidom document for a <Map> file and flatten it to a string."""

    def __init__(self):
        # Start from an empty document that only contains a <Map> root.
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create the <Map> root element and attach it to the document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the current minidom document."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Replace the current document with the one parsed from AbsFileName.

        Returns:
            True when the file was parsed, False (after printing a message)
            when it could not be opened. On failure the previous document is
            left untouched.
        """
        try:
            # Parse before unlinking, so a failed load does not leave an
            # emptied document behind.
            new_doc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False

        self.XMLdoc.unlink()
        self.XMLdoc = new_doc
        # Only adopt the root when it is the expected <Map> element.
        root = self.XMLdoc.documentElement
        if root is not None and root.nodeName == 'Map':
            self.RootNode = root
        return True

    def GetMapData(self):
        """Flatten the document into ``fields|landmarks|EMPTY``.

        Text of the plain child elements is joined with ','; no comma is
        added after the ``URL`` element. The landmark list is rendered by
        ``loopLandmark``. The trailing 'EMPTY' is appended unconditionally;
        region points are not extracted.
        """
        result = ""
        result1 = ""  # stays empty when there is no landmark list
        root = self.XMLdoc.documentElement
        children = root.childNodes if root is not None else []
        for child in children:
            # Whitespace between elements shows up as text nodes; skip them.
            if child.nodeType != child.ELEMENT_NODE:
                continue
            name = child.nodeName
            if name == 'LandmarkPointListXml':
                result1 = self.loopLandmark(child)
            elif name == 'RegionPointListXml':
                print('Empty')
            elif name == 'URL':
                result = result + self._node_text(child)
            else:
                result = result + self._node_text(child) + ','
        return result + "|" + result1 + "EMPTY"

    def loopLandmark(self, landmarks):
        """Render every 'anyType' entry, joined by ';' and closed by '|'.

        Returns "" when there are no entries.
        """
        entries = [self.loopAnyType(node)
                   for node in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ';'.join(entries) + '|'

    def loopAnyType(self, anyType):
        """Return 'x,y,desc' for one landmark entry.

        A missing or empty field contributes an empty string.
        """
        values = []
        for tag in ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc'):
            matches = anyType.getElementsByTagName(tag)
            values.append(self._node_text(matches[0]) if matches else "")
        return ",".join(values)

    def _node_text(self, node):
        """Return the value of node's first child, or "" when there is none."""
        first = node.firstChild
        if first is None or first.nodeValue is None:
            return ""
        return first.nodeValue

# Load the sample map file and print its flattened contents.
profile = mapDataClass()
# Raw string: the backslash is a path separator, not an escape sequence.
boolean = profile.LoadXMLFile(r'upload\er.m')
print(boolean)
# Only flatten the document when the file was actually loaded.
if boolean:
    result = profile.GetMapData()
    print(result)

Answer 3

我之前的回答仍不完整。下面是我认为应该没问题的版本。

import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString, Node

class mapDataClass:
    """Hold a minidom document for a <Map> file and flatten it to a string."""

    def __init__(self):
        # Start from an empty document that only contains a <Map> root.
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create the <Map> root element and attach it to the document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the current minidom document."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Replace the current document with the one parsed from AbsFileName.

        Returns:
            True when the file was parsed, False (after printing a message)
            when it could not be opened. On failure the previous document is
            left untouched.
        """
        try:
            # Parse before unlinking, so a failed load does not leave an
            # emptied document behind.
            new_doc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False

        self.XMLdoc.unlink()
        self.XMLdoc = new_doc
        # Only adopt the root when it is the expected <Map> element.
        root = self.XMLdoc.documentElement
        if root is not None and root.nodeName == 'Map':
            self.RootNode = root
        return True

    def GetMapData(self):
        """Flatten the document into ``fields|landmarks|regions``.

        Text of the plain child elements is joined with ','; no comma is
        added after the ``URL`` element. The landmark part comes from
        ``loopLandmark``, or is 'EMPTY|' when the list holds no entries.
        The region part is 'EMPTY' when the region list has no child
        elements; region contents are otherwise not extracted.
        """
        result = ""
        result1 = ""
        result2 = ""
        root = self.XMLdoc.documentElement
        children = root.childNodes if root is not None else []
        for child in children:
            # Whitespace between elements shows up as text nodes; skip them.
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            name = child.nodeName
            if name == 'LandmarkPointListXml':
                # loopLandmark returns "" when there are no entries, which
                # also covers a list that contains only whitespace.
                result1 = self.loopLandmark(child) or 'EMPTY|'
            elif name == 'RegionPointListXml':
                if not self._has_element_children(child):
                    result2 = 'EMPTY'
            elif name == 'URL':
                result = result + self._node_text(child)
            else:
                result = result + self._node_text(child) + ','
        return result + "|" + result1 + result2

    def loopLandmark(self, landmarks):
        """Render every 'anyType' entry, joined by ';' and closed by '|'.

        Returns "" when there are no entries.
        """
        entries = [self.loopAnyType(node)
                   for node in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ';'.join(entries) + '|'

    def loopAnyType(self, anyType):
        """Return 'x,y,desc' for one landmark entry.

        A missing or empty field contributes an empty string.
        """
        values = []
        for tag in ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc'):
            matches = anyType.getElementsByTagName(tag)
            values.append(self._node_text(matches[0]) if matches else "")
        return ",".join(values)

    def _node_text(self, node):
        """Return the value of node's first child, or "" when there is none."""
        first = node.firstChild
        if first is None or first.nodeValue is None:
            return ""
        return first.nodeValue

    def _has_element_children(self, node):
        """Return True when node has at least one child element."""
        for sub in node.childNodes:
            if sub.nodeType == Node.ELEMENT_NODE:
                return True
        return False

# Load the test map file and, if that worked, print its flattened contents.
data = mapDataClass()
# Raw string: the backslash is a path separator, not an escape sequence.
success = data.LoadXMLFile(r"upload\homeTest.m")
if success:
    print("file loaded")
    print(data.GetMapData())
else:
    print("no such file found")

如何使用python minidom从XML中提取数据

3 个答案: