如何使用python minidom从XML中提取数据

时间:2011-09-08 02:04:02

标签: python

给出这个xml文件,我想从中提取数据。但是,我无法从<LandmarkPointListXml>开始提取数据。

XML文件:

  <?xml version="1.0" encoding="utf-8"?>
  <Map xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
  <MapName>er</MapName>
  <MapURL>er.gif</MapURL>
  <Name>er</Name>
  <URL>er.gif</URL>
  <LandmarkPointListXml>
    <anyType xsi:type="LandmarkPointProperty">
      <LandmarkPointX>400</LandmarkPointX>
      <LandmarkPointY>292</LandmarkPointY>
      <LandmarkDesc>my room door</LandmarkDesc>
    </anyType>
    <anyType xsi:type="LandmarkPointProperty">
      <LandmarkPointX>399</LandmarkPointX>
      <LandmarkPointY>219</LandmarkPointY>
      <LandmarkDesc>bro room door</LandmarkDesc>
    </anyType>
  </LandmarkPointListXml>
  <RegionPointListXml />
</Map>

Python程序:

    def GetMapData(self):
        """Serialize the children of the document's root element to a string.

        Every child element other than ``<LandmarkPointListXml>`` contributes
        its text followed by ``','``; the landmark list contributes
        ``'|' + self.loopLandmark(element) + '|'``.

        Returns:
            The concatenated string, e.g. ``'er,er.gif,er,er.gif,|...|'``.
        """
        parts = []
        root = self.XMLdoc.firstChild  # root node
        for child in root.childNodes:
            # Pretty-printed XML puts whitespace text nodes between the
            # elements; they carry no data and have no firstChild.
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName == 'LandmarkPointListXml':
                # Hand over the element itself: getElementsByTagName() is a
                # method of elements, not of a childNodes list.
                parts.append('|' + self.loopLandmark(child) + '|')
            else:
                first = child.firstChild
                # Empty elements such as <RegionPointListXml /> have no text
                # child and are skipped.
                if first is not None and first.nodeValue is not None:
                    parts.append(first.nodeValue + ',')
        return ''.join(parts)

    def loopLandmark(self, landmarks):
        """Format every ``<anyType>`` landmark as ``'x,y,description'``.

        Args:
            landmarks: The ``<LandmarkPointListXml>`` element, or its
                ``childNodes`` list.

        Returns:
            The landmark entries joined with ``';'``; an empty string when
            there are none. A missing or empty field yields an empty value.
        """
        # An element exposes childNodes; a node list is iterated as it is.
        nodes = getattr(landmarks, 'childNodes', landmarks)
        entries = []
        for node in nodes:
            # Skip whitespace text nodes and unrelated elements.
            if node.nodeType != node.ELEMENT_NODE or node.nodeName != 'anyType':
                continue
            fields = []
            for tag in ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc'):
                # Look fields up by name rather than by sibling position so
                # whitespace nodes and reordered fields do not matter.
                found = node.getElementsByTagName(tag)
                text = found[0].firstChild if found else None
                fields.append((text.nodeValue or '') if text is not None else '')
            entries.append(','.join(fields))
        return ';'.join(entries)

我能够得到结果 "er,er.gif,er,er.gif,",但程序一到达 <LandmarkPointListXml> 就无法继续提取数据了。

3 个答案:

答案 0 :(得分:2)

此代码非常脆弱。它对XML输入做了很强的假设,只要以某种合法的方式修改XML,它就会失败(例如,<LandmarkPointY> 不是紧跟在 <LandmarkPointX> 之后)。

我建议在解析XML时使用标准库,例如元素树(http://docs.python.org/library/xml.etree.elementtree.html)或lxml(http://lxml.de),它们也可以验证您的XML输入。

我在下面编写的代码使用了Element Tree并处理了你的XML输入(我删除了父类的'self'参数)。它还容忍(忽略)XML元素中的空值。

import xml.etree.ElementTree as ET

def GetMapData(xmlfile):
    """Flatten the children of the XML root element into one string.

    Args:
        xmlfile: Path of the XML file, or an open file object, as accepted
            by ``ElementTree.parse``.

    Returns:
        The text of each simple child followed by ``','``, with the
        ``<LandmarkPointListXml>`` child rendered as
        ``'|' + loopLandmark(child) + '|'``. Children without text are
        skipped. An empty string is returned when the file cannot be read
        (a message is printed in that case).
    """
    try:
        tree = ET.parse(xmlfile)
    except IOError as e:
        print("Failure Parsing %s: %s" % (xmlfile, e))
        # Without a parsed tree there is nothing to walk; falling through
        # would fail on the unbound ``tree`` name.
        return ""
    parts = []
    for child in tree.getroot():
        if child.tag == 'LandmarkPointListXml':
            parts.append('|' + loopLandmark(child) + '|')
        elif child.text is not None:
            parts.append(child.text + ',')
    return ''.join(parts)

def loopLandmark(landmarks):
    """Collect the X/Y coordinate texts of every ``anyType`` landmark.

    Args:
        landmarks: The ``<LandmarkPointListXml>`` element whose children
            are scanned.

    Returns:
        Each non-empty ``LandmarkPointX`` / ``LandmarkPointY`` text followed
        by ``','``, concatenated in document order.
    """
    wanted = ['LandmarkPointX', 'LandmarkPointY']
    pieces = []
    for entry in landmarks:
        # check also xsi:type="LandmarkPointProperty"?
        if entry.tag != 'anyType':
            continue
        for field in entry:
            if field.text and field.tag in wanted:
                pieces.append(field.text + ',')
    return ''.join(pieces)

# Example invocation on the sample input file; the returned string is not
# used here.
GetMapData('xml.in')

答案 1 :(得分:0)

我设法从上面贴出的XML文件中提取出了数据。但我觉得应该有比我这个答案更简单的做法:现在要取出每一项数据,都需要进行大量循环。

import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString

class mapDataClass:
    """Load a ``<Map>`` XML document with minidom and flatten it to a string."""

    def __init__(self):
        """Start with an empty document that has only a ``<Map>`` root."""
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create the ``<Map>`` root element and attach it to the document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the minidom document currently held by this object."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Replace the current document with the one parsed from a file.

        Args:
            AbsFileName: Path of the XML file to parse.

        Returns:
            True when the file was parsed, False when it could not be read
            (a message is printed in that case). Errors raised by ``parse``
            for malformed XML are not caught here.
        """
        try:
            new_doc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False
        # Discard the old document only once the new one has been parsed,
        # so a failed load leaves this object usable.
        self.XMLdoc.unlink()
        self.XMLdoc = new_doc
        if self.XMLdoc.hasChildNodes():  # if not empty
            # Remember the root node when it is the expected <Map> element.
            if self.XMLdoc.firstChild.nodeName == 'Map':
                self.RootNode = self.XMLdoc.firstChild
        return True

    @staticmethod
    def _text_of(element):
        """Return the value of the element's first child node, or ''."""
        first = element.firstChild
        if first is None or first.nodeValue is None:
            return ''
        return first.nodeValue

    def GetMapData(self):
        """Flatten the document into ``'<fields>|<landmarks>EMPTY'``.

        The text of each simple child element of the root is appended,
        followed by ``','`` except after ``<URL>``. The result of
        ``loopLandmark`` for ``<LandmarkPointListXml>`` follows a ``'|'``,
        and the literal ``'EMPTY'`` closes the string. ``'Empty'`` is
        printed when ``<RegionPointListXml>`` is encountered.

        Returns:
            The flattened string.
        """
        result = ""
        result1 = ""  # stays empty when there is no <LandmarkPointListXml>
        root = self.XMLdoc.firstChild  # root node
        for child in root.childNodes:
            # Whitespace between elements shows up as text nodes that carry
            # no data and have no firstChild.
            if child.nodeType != child.ELEMENT_NODE:
                continue
            if child.nodeName == 'LandmarkPointListXml':
                result1 = self.loopLandmark(child)
            elif child.nodeName == 'RegionPointListXml':
                print('Empty')
            elif child.nodeName == 'URL':
                result = result + self._text_of(child)
            else:
                # An empty element contributes an empty field so the
                # positions of the remaining fields are preserved.
                result = result + self._text_of(child) + ','
        return result + "|" + result1 + "EMPTY"

    def loopLandmark(self, landmarks):
        """Format all ``<anyType>`` descendants of ``landmarks``.

        Args:
            landmarks: The ``<LandmarkPointListXml>`` element.

        Returns:
            The ``loopAnyType`` strings joined with ``';'`` and terminated
            by ``'|'``; an empty string when there are no landmarks.
        """
        entries = [self.loopAnyType(node)
                   for node in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ';'.join(entries) + '|'

    def loopAnyType(self, anyType):
        """Return ``'x,y,description'`` for one ``<anyType>`` element.

        A field whose element is missing or empty yields an empty value.
        """
        fields = []
        for tag in ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc'):
            found = anyType.getElementsByTagName(tag)
            fields.append(self._text_of(found[0]) if found else '')
        return ','.join(fields)

profile = mapDataClass()
# Raw string: the backslash is a path separator, not the start of an escape.
boolean = profile.LoadXMLFile(r'upload\er.m')
print(boolean)
# Only flatten the document when the file was actually loaded.
if boolean:
    result = profile.GetMapData()
    print(result)

答案 2 :(得分:0)

我之前的回答还不完整。下面这个版本我认为应该没问题了。

import sys
import socket
import os
from xml.dom.minidom import Document, parse, parseString, Node

class mapDataClass:
    """Load a ``<Map>`` XML document with minidom and flatten it to a string."""

    def __init__(self):
        """Start with an empty document that has only a ``<Map>`` root."""
        self.XMLdoc = Document()
        self.MakeRootNode()

    def MakeRootNode(self):
        """Create the ``<Map>`` root element and attach it to the document."""
        self.RootNode = self.XMLdoc.createElement('Map')
        self.XMLdoc.appendChild(self.RootNode)

    def GetXML_Doc(self):
        """Return the minidom document currently held by this object."""
        return self.XMLdoc

    def LoadXMLFile(self, AbsFileName):
        """Replace the current document with the one parsed from a file.

        Args:
            AbsFileName: Path of the XML file to parse.

        Returns:
            True when the file was parsed, False when it could not be read
            (a message is printed in that case). Errors raised by ``parse``
            for malformed XML are not caught here.
        """
        try:
            new_doc = parse(AbsFileName)
        except IOError:
            print('File ' + AbsFileName + ' not found')
            return False
        # Discard the old document only once the new one has been parsed,
        # so a failed load leaves this object usable.
        self.XMLdoc.unlink()
        self.XMLdoc = new_doc
        if self.XMLdoc.hasChildNodes():  # if not empty
            if self.XMLdoc.firstChild.nodeName == 'Map':
                self.RootNode = self.XMLdoc.firstChild
        return True

    @staticmethod
    def _text_of(element):
        """Return the value of the element's first child node, or ''."""
        first = element.firstChild
        if first is None or first.nodeValue is None:
            return ''
        return first.nodeValue

    @staticmethod
    def _has_element_children(element):
        """Tell whether the element has at least one child element."""
        for node in element.childNodes:
            if node.nodeType == Node.ELEMENT_NODE:
                return True
        return False

    def GetMapData(self):
        """Flatten the document into ``'<fields>|<landmarks><regions>'``.

        The text of each simple child element of the root is appended,
        followed by ``','`` except after ``<URL>``. The landmark part is the
        result of ``loopLandmark`` or ``'EMPTY|'`` when there are no
        landmarks; the region part is ``'EMPTY'`` when
        ``<RegionPointListXml>`` has no child elements and is otherwise
        left empty.

        Returns:
            The flattened string.
        """
        result = ""
        result1 = ""
        result2 = ""
        root = self.XMLdoc.firstChild  # root node
        for child in root.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.nodeName == 'LandmarkPointListXml':
                # A list holding only whitespace has a firstChild but no
                # landmarks, so decide on what was actually extracted.
                result1 = self.loopLandmark(child) or 'EMPTY|'
            elif child.nodeName == 'RegionPointListXml':
                if not self._has_element_children(child):
                    result2 = 'EMPTY'
            elif child.nodeName == 'URL':
                result = result + self._text_of(child)
            else:
                # An empty element contributes an empty field so the
                # positions of the remaining fields are preserved.
                result = result + self._text_of(child) + ','
        return result + "|" + result1 + result2

    def loopLandmark(self, landmarks):
        """Format all ``<anyType>`` descendants of ``landmarks``.

        Args:
            landmarks: The ``<LandmarkPointListXml>`` element.

        Returns:
            The ``loopAnyType`` strings joined with ``';'`` and terminated
            by ``'|'``; an empty string when there are no landmarks.
        """
        entries = [self.loopAnyType(node)
                   for node in landmarks.getElementsByTagName('anyType')]
        if not entries:
            return ""
        return ';'.join(entries) + '|'

    def loopAnyType(self, anyType):
        """Return ``'x,y,description'`` for one ``<anyType>`` element.

        A field whose element is missing or empty yields an empty value.
        """
        fields = []
        for tag in ('LandmarkPointX', 'LandmarkPointY', 'LandmarkDesc'):
            found = anyType.getElementsByTagName(tag)
            fields.append(self._text_of(found[0]) if found else '')
        return ','.join(fields)

data = mapDataClass()
# Raw string: the backslash is a path separator, not the start of an escape.
success = data.LoadXMLFile(r"upload\homeTest.m")
if success:
    print("file loaded")
    print(data.GetMapData())
else:
    print("no such file found")