我正在尝试用 Python 从 XML 标签中提取一些信息。我的目标是为每个 situation 标签的 id 保存一个字典,其中包含该标签下所有子节点的数据,但我不知道该如何处理并提取文本节点中的数据。谢谢。
我的代码:
from xml.dom.minidom import *
import requests
# Download the XML feed, store it on disk, then parse the saved copy with minidom.
print("GETTING XML...")
# A timeout keeps the script from hanging forever if the server stops responding.
resp = requests.get('http://infocar.dgt.es/datex2/dgt/SituationPublication/all/content.xml', stream = True, timeout = 30) #XML that I need
if resp.status_code != 200:
    # The original raised an undefined ApiError (a NameError at runtime);
    # use the exception type that requests already provides.
    raise requests.HTTPError('GET /tasks/ {}'.format(resp.status_code), response=resp)
print("XML RECIBIDO 200 OK")
print("GUARDANDO XML")
with open("DGT_DATEX.xml", "wb") as handle:
    # iter_content() yields one byte at a time by default; ask for real chunks.
    for data in resp.iter_content(chunk_size=8192):
        handle.write(data)
print("XML GUARDADO")
print("INICIANDO PARSEO..")
dom3 = parse("DGT_DATEX.xml")
print(dom3)  # shows the Document object (its memory address)
print("DATEX PARSEADO")
def getText(nodelist):
    """Map the ``id`` attribute of each element in *nodelist* to its text data.

    Every element node gets its own list holding the data of all text nodes
    found beneath it (collected by ``goDeep``). Non-element nodes such as
    whitespace text or comments are skipped because they carry no attributes.

    Returns:
        dict mapping ``id`` attribute value -> list of text-node strings.
        Elements without an ``id`` attribute end up under the key ``''``.
    """
    texts_by_id = {}
    # The original looped over an undefined name instead of the parameter.
    for node in nodelist:
        if node.nodeType != node.ELEMENT_NODE:
            continue
        # A fresh list per element: the original reused one list for every
        # key, so all ids pointed at the same accumulated data.
        texts_by_id[node.getAttribute('id')] = goDeep(node.childNodes, [])
    print(str.format("El diccionario antes de ser retornado es {0}", texts_by_id))
    return texts_by_id
def goDeep(childsOfElement, l):
    """Append the data of every text node under *childsOfElement* to *l*.

    Nodes are visited depth-first in document order. The list *l* is
    modified in place and also returned.
    """
    # Explicit stack; children are pushed reversed so they pop in document order.
    pending = list(childsOfElement)[::-1]
    while pending:
        current = pending.pop()
        if current.nodeType == current.TEXT_NODE:
            l.append(current.data)
        else:
            pending.extend(list(current.childNodes)[::-1])
    return l
def getSituation(payloadTag):
    """Return the id -> text-data dictionary for the children of *payloadTag*.

    The original called ``getText`` but dropped its result, so callers
    always received ``None``.
    """
    return getText(payloadTag.childNodes)
def getPayLoad(dom):
    """Return the dictionary built from the first ``payloadPublication`` element.

    Raises:
        IndexError: if *dom* contains no ``payloadPublication`` element.
    """
    # Look the element up once instead of searching the document twice.
    payload = dom.getElementsByTagNameNS('*', 'payloadPublication')[0]
    print(str.format("Tag to be processed:{0}", payload))
    # Hand the result back; the original discarded it and returned None.
    return getSituation(payload)
# Build the payload once; calling getPayLoad twice repeated the whole
# traversal and printed its debug output twice.
payload_data = getPayLoad(dom3)
print(str.format("Verificando que el dato retornado es un diccionario, {0}, y contiene {1}", type(payload_data), payload_data))
答案 0 :(得分:0)
下面是我最终用来从子节点中收集数据的方法,谢谢。
import xml.etree.ElementTree as ET
from xml.dom.minidom import *
import requests
# Download the XML feed, store it on disk, then parse the saved copy with minidom.
print("GETTING XML...")
# A timeout keeps the script from hanging forever if the server stops responding.
resp = requests.get('http://infocar.dgt.es/datex2/dgt/SituationPublication/all/content.xml', stream = True, timeout = 30) #XML that I need
if resp.status_code != 200:
    # The original raised an undefined ApiError (a NameError at runtime);
    # use the exception type that requests already provides.
    raise requests.HTTPError('GET /tasks/ {}'.format(resp.status_code), response=resp)
print("XML RECIBIDO 200 OK")
print("GUARDANDO XML")
with open("DGT_DATEX.xml", "wb") as handle:
    # iter_content() yields one byte at a time by default; ask for real chunks.
    for data in resp.iter_content(chunk_size=8192):
        handle.write(data)
print("XML GUARDADO")
print("INICIANDO PARSEO..")
dom3 = parse("DGT_DATEX.xml")
print(dom3)  # shows the Document object (its memory address)
print("DATEX PARSEADO")
def getAttributeID(element):
    """Return the value of the ``id`` attribute of *element*."""
    identifier = element.getAttribute('id')
    return identifier
def getText(element):
    """Return the ``data`` of a text node, or ``''`` when *element* is None.

    Callers pass ``someElement.firstChild``, which is ``None`` for an empty
    element; returning an empty string avoids an AttributeError in that case.
    """
    if element is None:
        return ""
    return element.data
# Tags read once per situation element.
_SITUATION_FIELDS = ("confidentiality", "informationStatus")

# Tags read once per situationRecord element, in the order they are printed.
_RECORD_FIELDS = (
    "situationRecordCreationReference",
    "situationRecordCreationTime",
    "situationRecordVersion",
    "situationRecordVersionTime",
    "situationRecordFirstSupplierVersionTime",
    "probabilityOfOccurrence",
    "sourceCountry",
    "sourceIdentification",
    "validityStatus",
    "overallStartTime",
    "overallEndTime",
    "impactOnTraffic",
    "locationDescriptor",
    "tpegDirection",
    "latitude",
    "longitude",
    "descriptor",
)


def _first_text(parent, tag):
    """Return the leading text of the first *tag* descendant of *parent*, else None."""
    matches = parent.getElementsByTagNameNS('*', tag)
    if not matches:
        return None
    first_child = matches[0].firstChild
    if first_child is None or first_child.nodeType != first_child.TEXT_NODE:
        return None
    return first_child.data


def getPayLoad(dom):
    """Print and collect the fields of every ``situation`` element in *dom*.

    Each field is looked up inside its own situation / situationRecord
    element. The original indexed document-wide tag lists with counters that
    started at 1, which skipped the first element, read the children of the
    following situation, raised IndexError on the last one, and misaligned
    values whenever a tag was missing or repeated.

    Args:
        dom: a minidom Document (or Element) to search.

    Returns:
        dict mapping situation id -> dict with one key per situation field
        plus ``'records'``, itself a dict mapping situationRecord id -> dict
        of record fields. A field that is absent, or whose first child is
        not a text node, is stored (and printed) as ``None``.
    """
    situations = {}
    record_number = 1  # running count of records across all situations, for display only
    for number, situation in enumerate(dom.getElementsByTagNameNS('*', 'situation'), start=1):
        # For every situation: report its id, then every field that belongs to it.
        situation_id = situation.getAttribute('id')
        print(str.format("Situation ID: {0} numero {1}", situation_id, number))
        entry = {}
        for tag in _SITUATION_FIELDS:
            entry[tag] = _first_text(situation, tag)
            print(entry[tag])

        records = {}
        for child in situation.childNodes:
            # Only direct situationRecord children are records; other
            # element children and whitespace text are skipped.
            if child.nodeType != child.ELEMENT_NODE or child.localName != 'situationRecord':
                continue
            record_id = child.getAttribute('id')
            print(str.format("SituationRecord ID: {0} numero {1}", record_id, record_number))
            fields = {}
            for tag in _RECORD_FIELDS:
                value = _first_text(child, tag)
                fields[tag] = value
                if tag == 'descriptor':
                    print(str.format("VALUE FIELD: {0}", value))
                else:
                    print(value)
            records[record_id] = fields
            record_number += 1

        entry['records'] = records
        situations[situation_id] = entry
    return situations
# Run the extraction over the parsed document; the return value is not used here.
getPayLoad(dom3)