Question

我想用 Python 从 XML 标签中提取一些信息。我的目标是得到一个字典：以每个 situation 标签的 id 为键，保存该标签下所有子节点的数据。但我不知道该如何提取文本节点中的数据，谢谢。

我的代码：

from xml.dom.minidom import *
import requests

# Download the XML feed, store it on disk, then parse the saved file with minidom.
print("GETTING XML...")
resp = requests.get('http://infocar.dgt.es/datex2/dgt/SituationPublication/all/content.xml', stream = True) #XML that I need
if resp.status_code != 200:
    # The original raised an undefined `ApiError`, which surfaced as a NameError.
    # Raise the HTTP error type that requests already provides instead.
    raise requests.exceptions.HTTPError(
        'GET {} {}'.format(resp.url, resp.status_code), response=resp)
print("XML RECIBIDO 200 OK")
print("GUARDANDO XML")
with open("DGT_DATEX.xml", "wb") as handle:
    # iter_content() defaults to 1-byte chunks; ask for larger chunks so the
    # file is not written one byte at a time.
    for data in resp.iter_content(chunk_size=8192):
        handle.write(data)

print("XML GUARDADO")
print("INICIANDO PARSEO..")
dom3 = parse("DGT_DATEX.xml")
print(dom3)  # prints the repr of the parsed Document object
print("DATEX PARSEADO")




def getText(nodelist):
    """Map the ``id`` attribute of each element in *nodelist* to its text data.

    For every element node in *nodelist*, the text found anywhere below that
    element is collected (via ``goDeep``) into a list of its own, and stored
    under the element's ``id`` attribute. Elements without an ``id`` attribute
    end up under the empty-string key, because ``getAttribute`` returns ``""``
    for a missing attribute.

    Args:
        nodelist: Iterable of DOM nodes (for example ``element.childNodes``).

    Returns:
        dict mapping id strings to lists of text-node data.
    """
    texts_by_id = {}
    for node in nodelist:
        # Only element nodes have attributes; text, comment and similar nodes
        # are skipped.
        if node.nodeType != node.ELEMENT_NODE:
            continue
        # A fresh list per element: sharing one accumulator made every id
        # point at the same list holding the text of all elements.
        texts_by_id[node.getAttribute('id')] = goDeep(node.childNodes, [])

    print(str.format("El diccionario antes de ser retornado es {0}", texts_by_id))
    return texts_by_id

def goDeep(childsOfElement, l):
    """Append the data of every text node under *childsOfElement* to *l*.

    The nodes are visited depth-first in document order. Text nodes contribute
    their ``data``; every other node is descended into through its
    ``childNodes``.

    Args:
        childsOfElement: Iterable of DOM nodes to walk.
        l: List that receives the text data; it is modified in place.

    Returns:
        The same list object *l*.
    """
    for child in childsOfElement:
        if child.nodeType == child.TEXT_NODE:
            l.append(child.data)
            continue
        # Not a text node: look for text further down the tree.
        goDeep(child.childNodes, l)
    return l

def getSituation(payloadTag):
    """Return the result of ``getText`` applied to the children of *payloadTag*.

    The original computed this value and then discarded it, so callers always
    received ``None``.

    Args:
        payloadTag: DOM element whose ``childNodes`` are processed.

    Returns:
        Whatever ``getText`` returns for ``payloadTag.childNodes``.
    """
    return getText(payloadTag.childNodes)



def getPayLoad(dom):
    """Process the first ``payloadPublication`` element of *dom*.

    The element is looked up by local name in any namespace, printed for
    diagnostics, and handed to ``getSituation``.

    Args:
        dom: Parsed DOM document.

    Returns:
        Whatever ``getSituation`` returns for the payload element.

    Raises:
        IndexError: If the document has no ``payloadPublication`` element.
    """
    # Look the element up once instead of repeating the same DOM search.
    payload = dom.getElementsByTagNameNS('*', 'payloadPublication')[0]
    print(str.format("Tag to be processed:{0}", payload))
    return getSituation(payload)


# Call getPayLoad only once: calling it twice walked the document twice and
# repeated its diagnostic output.
payload_data = getPayLoad(dom3)
print(str.format("Verificando que el dato retornado es un diccionario, {0}, y contiene {1}", type(payload_data), payload_data))

Answer 1

下面是我最终用来从子节点收集数据的方法，谢谢。

import xml.etree.ElementTree as ET

from xml.dom.minidom import *

import requests

# Download the XML feed, store it on disk, then parse the saved file with minidom.
print("GETTING XML...")
resp = requests.get('http://infocar.dgt.es/datex2/dgt/SituationPublication/all/content.xml', stream = True) #XML that I need
if resp.status_code != 200:
    # The original raised an undefined `ApiError`, which surfaced as a NameError.
    # Raise the HTTP error type that requests already provides instead.
    raise requests.exceptions.HTTPError(
        'GET {} {}'.format(resp.url, resp.status_code), response=resp)
print("XML RECIBIDO 200 OK")
print("GUARDANDO XML")
with open("DGT_DATEX.xml", "wb") as handle:
    # iter_content() defaults to 1-byte chunks; ask for larger chunks so the
    # file is not written one byte at a time.
    for data in resp.iter_content(chunk_size=8192):
        handle.write(data)

print("XML GUARDADO")
print("INICIANDO PARSEO..")
dom3 = parse("DGT_DATEX.xml")
print(dom3)  # prints the repr of the parsed Document object
print("DATEX PARSEADO")

def getAttributeID(element):
    """Return the value of the ``id`` attribute of *element*."""
    identifier = element.getAttribute('id')
    return identifier

def getText(element):
    """Return the ``data`` attribute of *element* (the text of a text node)."""
    text = element.data
    return text

# Record-level fields printed for each situationRecord, in output order.
_RECORD_FIELDS = (
    'situationRecordCreationReference',
    'situationRecordCreationTime',
    'situationRecordVersion',
    'situationRecordVersionTime',
    'situationRecordFirstSupplierVersionTime',
    'probabilityOfOccurrence',
    'sourceCountry',
    'sourceIdentification',
    'validityStatus',
    'overallStartTime',
    'overallEndTime',
    'impactOnTraffic',
    'locationDescriptor',
    'tpegDirection',
    'latitude',
    'longitude',
)


def _first_text(scope, local_name):
    """Return the data of the first child of the first *local_name* element below *scope*, or None."""
    matches = scope.getElementsByTagNameNS('*', local_name)
    if not matches:
        return None
    # firstChild is None for an empty element; getattr keeps that case at None.
    return getattr(matches[0].firstChild, 'data', None)


def getPayLoad(dom):
    """Print the fields of every ``situation`` element and of its records.

    For each ``situation`` (matched by local name in any namespace) this
    prints its ``id`` and running number, then its ``confidentiality`` and
    ``informationStatus`` values. For each direct ``situationRecord`` child it
    prints the record ``id`` and running number, the fields listed in
    ``_RECORD_FIELDS`` and finally the ``descriptor`` value.

    Every value is looked up inside the situation or record it belongs to.
    The original indexed document-wide element lists with counters starting
    at 1, which paired each situation with the data of the following one,
    raised ``IndexError`` on the last situation, and counted every non-text
    child of a situation as a record.

    A field that is missing or empty is printed as ``None``. Record numbers
    run across the whole document; they do not restart for each situation.

    Args:
        dom: Parsed DOM document (or element) to search.

    Returns:
        None. All output goes to standard output.
    """
    record_number = 1
    situations = dom.getElementsByTagNameNS('*', 'situation')
    for situation_number, situation in enumerate(situations, start=1):
        print(str.format("Situation ID: {0} numero {1}", situation.getAttribute('id'), situation_number))
        print(_first_text(situation, 'confidentiality'))
        print(_first_text(situation, 'informationStatus'))
        for record in situation.childNodes:
            # Only direct situationRecord children are records; other children
            # (text nodes, header elements, ...) are skipped.
            if record.nodeType != record.ELEMENT_NODE or record.localName != 'situationRecord':
                continue
            print(str.format("SituationRecord ID: {0} numero {1}", record.getAttribute('id'), record_number))
            for field in _RECORD_FIELDS:
                print(_first_text(record, field))
            print(str.format("VALUE FIELD: {0}", _first_text(record, 'descriptor')))
            record_number += 1

getPayLoad(dom3)

Python XML DOM收集元素数据

1 个答案: