在元素中找不到标签值

时间:2019-03-14 19:26:29

标签: python xml pandas

我正在尝试将XML文件解析为pandas Dataframe。我的根元素是<Games>,其中包含一个元素<Game>。我想在<Event>元素内检索标签值。

我原以为在 record 元素上调用 find() 函数就能直接取到这些值,但它始终返回 None(找不到任何内容)。

import xml.etree.ElementTree as ET
import pandas as pd
from xml_model import XMLTagsLowerLevel, XMLTagsUpperLevel

class XMLParser:
    """
    Reads an event-details XML file and turns it into a pandas data frame.
    """

    # Directory prefix used to build the default input path.
    BASE_PATH = "../data/1-19/"

    def __init__(self, file_path=BASE_PATH + "f24-24-2016-853139-eventdetails.xml"):
        """
        Store the location of the XML file to parse.
        :param file_path: Path to the input xml file; defaults to a file under BASE_PATH.
        """
        self.file_path = file_path

    def xml_to_pandas_df(self):
        """
        Parse the xml file at ``self.file_path`` with the standard library and collect one row per
        <Event> found under each <Game> child of the root.
        :return: A pandas data frame with one column per XMLTagsLowerLevel value.
        """
        root = ET.parse(self.file_path).getroot()

        # One (initially empty) column per tag name, in enum definition order.
        columns = {tag.value: [] for tag in XMLTagsLowerLevel}

        for game in root.findall(XMLTagsUpperLevel.GAME):
            for event in game.findall(XMLTagsUpperLevel.EVENT):
                for name, values in columns.items():
                    # find() searches the event's direct child elements for this tag name;
                    # an empty string is stored when no such child exists.
                    child = event.find(name)
                    values.append("" if child is None else child.text)

        return pd.DataFrame(data=columns)


# Parse the default input file, then show the frame's column labels,
# its leading rows and its dimensions, in that order.
parser = XMLParser()
jobs_df = parser.xml_to_pandas_df()
for summary in (jobs_df.columns, jobs_df.head(), jobs_df.shape):
    print(summary)

我的XML模型:

# XML model
from enum import Enum


class XMLTagsUpperLevel:
    """
    Tag-name constants for the container elements of the XML tree, listed from the outermost element inwards.
    """
    GAMES = "Games"
    GAME = "Game"
    EVENT = "Event"
    QUALIFIER = "Q"



class XMLTagsLowerLevel(Enum):
    """
    This class defines the names of the per-event fields. It is defined as an enumerated type for ease of
    iterating over all names; members are iterated in the order they are declared here.

    NOTE(review): in the sample XML these names appear as attributes of the <Event> element rather than as
    child elements -- confirm that the code consuming this enum looks them up accordingly.
    """
    ID = "id"
    EVENT = "event_id"
    TYPE = "type_id"
    PERIOD = "period_id"
    MINUTE = "min"
    SECOND = "sec"
    PLAYER = "player_id"
    TEAM = "team_id"
    OUTCOME = "outcome"
    X = "x"
    Y = "y"
    TIMESTAMP = "timestamp"
    LAST_MODIFIED = "last_modified"
    VERSION = "version"

这是我的xml文件格式。

<Games timestamp="2016-12-02T09:06:51">
  <Game id="853139" away_team_id="143" away_team_name="Lyon" competition_id="24" competition_name="French Ligue 1" game_date="2016-08-14T14:00:00" home_team_id="148" home_team_name="Nancy" matchday="1" period_1_start="2016-08-14T14:00:25" period_2_start="2016-08-14T15:02:29" season_id="2016" season_name="Season 2016/2017">
    <Event id="1195160021" event_id="1" type_id="34" period_id="16" min="0" sec="0" team_id="143" outcome="1" x="0.0" y="0.0" timestamp="2016-08-14T13:08:34.349" last_modified="2016-08-14T13:59:59" version="1471179598746">
      <Q id="1117749718" qualifier_id="194" value="59963" />
      <Q id="1807420796" qualifier_id="30" value="59957, 54772, 37832, 59963, 44488, 52775, 169007, 168568, 59966, 166552, 149519, 220560, 173211, 55305, 107641, 37852, 59956, 71389" />
      <Q id="450557206" qualifier_id="197" value="645" />
      <Q id="1671039854" qualifier_id="131" value="1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0" />
      <Q id="108315093" qualifier_id="227" value="0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0" />
      <Q id="582175015" qualifier_id="44" value="1, 2, 2, 3, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5" />
      <Q id="1069121575" qualifier_id="130" value="4" />
      <Q id="459298302" qualifier_id="59" value="1, 20, 15, 21, 2, 3, 14, 8, 10, 18, 27, 22, 4, 7, 12, 28, 30, 31" />
    </Event>
    <Event id="2066606636" event_id="1" type_id="34" period_id="16" min="0" sec="0" team_id="148" outcome="1" x="0.0" y="0.0" timestamp="2016-08-14T13:08:35.580" last_modified="2016-08-14T15:03:52" version="1471183432594">
      <Q id="891471807" qualifier_id="194" value="171101" />
      <Q id="201984211" qualifier_id="30" value="38816, 80799, 43024, 9980, 170034, 171101, 210460, 214472, 51327, 38008, 97290, 63600, 152337, 209874, 44314, 214473, 93498, 54911" />
      <Q id="478809608" qualifier_id="131" value="1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0" />
      <Q id="974533808" qualifier_id="227" value="0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0" />
      <Q id="193300652" qualifier_id="44" value="1, 2, 2, 3, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5" />
      <Q id="1493018979" qualifier_id="130" value="4" />
      <Q id="454462015" qualifier_id="59" value="16, 14, 26, 25, 4, 2, 13, 6, 9, 7, 23, 1, 3, 8, 12, 17, 19, 28" />
    </Event>
    <Event id="931188097" event_id="2" type_id="32" period_id="1" min="0" sec="0" team_id="143" outcome="1" x="0.0" y="0.0" timestamp="2016-08-14T14:00:25.556" last_modified="2016-08-14T14:00:26" version="1471179625559">
      <Q id="674324086" qualifier_id="127" value="Right to Left" />
    </Event>
  </Game>
</Games>

亲切的问候

2 个答案:

答案 0 :(得分:1)

这似乎可行:

import xml.etree.ElementTree as ET

# Inline copy of the sample document so the snippet runs without any file on disk.
xml_text = '''<Games timestamp="2016-12-02T09:06:51">
  <Game id="853139" away_team_id="143" away_team_name="Lyon" competition_id="24" competition_name="French Ligue 1" game_date="2016-08-14T14:00:00" home_team_id="148" home_team_name="Nancy" matchday="1" period_1_start="2016-08-14T14:00:25" period_2_start="2016-08-14T15:02:29" season_id="2016" season_name="Season 2016/2017">
    <Event id="1195160021" event_id="1" type_id="34" period_id="16" min="0" sec="0" team_id="143" outcome="1" x="0.0" y="0.0" timestamp="2016-08-14T13:08:34.349" last_modified="2016-08-14T13:59:59" version="1471179598746">
      <Q id="1117749718" qualifier_id="194" value="59963" />
      <Q id="1807420796" qualifier_id="30" value="59957, 54772, 37832, 59963, 44488, 52775, 169007, 168568, 59966, 166552, 149519, 220560, 173211, 55305, 107641, 37852, 59956, 71389" />
      <Q id="450557206" qualifier_id="197" value="645" />
      <Q id="1671039854" qualifier_id="131" value="1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0" />
      <Q id="108315093" qualifier_id="227" value="0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0" />
      <Q id="582175015" qualifier_id="44" value="1, 2, 2, 3, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5" />
      <Q id="1069121575" qualifier_id="130" value="4" />
      <Q id="459298302" qualifier_id="59" value="1, 20, 15, 21, 2, 3, 14, 8, 10, 18, 27, 22, 4, 7, 12, 28, 30, 31" />
    </Event>
    <Event id="2066606636" event_id="1" type_id="34" period_id="16" min="0" sec="0" team_id="148" outcome="1" x="0.0" y="0.0" timestamp="2016-08-14T13:08:35.580" last_modified="2016-08-14T15:03:52" version="1471183432594">
      <Q id="891471807" qualifier_id="194" value="171101" />
      <Q id="201984211" qualifier_id="30" value="38816, 80799, 43024, 9980, 170034, 171101, 210460, 214472, 51327, 38008, 97290, 63600, 152337, 209874, 44314, 214473, 93498, 54911" />
      <Q id="478809608" qualifier_id="131" value="1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0" />
      <Q id="974533808" qualifier_id="227" value="0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0" />
      <Q id="193300652" qualifier_id="44" value="1, 2, 2, 3, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5" />
      <Q id="1493018979" qualifier_id="130" value="4" />
      <Q id="454462015" qualifier_id="59" value="16, 14, 26, 25, 4, 2, 13, 6, 9, 7, 23, 1, 3, 8, 12, 17, 19, 28" />
    </Event>
    <Event id="931188097" event_id="2" type_id="32" period_id="1" min="0" sec="0" team_id="143" outcome="1" x="0.0" y="0.0" timestamp="2016-08-14T14:00:25.556" last_modified="2016-08-14T14:00:26" version="1471179625559">
      <Q id="674324086" qualifier_id="127" value="Right to Left" />
    </Event>
  </Game>
</Games>'''


# Collect every <Event> element in document order and print each one's
# attribute dictionary -- the per-event values are stored as XML attributes.
root = ET.fromstring(xml_text)
events = list(root.iter('Event'))
for event in events:
    print(event.attrib)

输出

{'min': '0', 'event_id': '1', 'timestamp': '2016-08-14T13:08:34.349', 'type_id': '34', 'period_id': '16', 'team_id': '143', 'version': '1471179598746', 'sec': '0', 'last_modified': '2016-08-14T13:59:59', 'y': '0.0', 'x': '0.0', 'outcome': '1', 'id': '1195160021'}
{'min': '0', 'event_id': '1', 'timestamp': '2016-08-14T13:08:35.580', 'type_id': '34', 'period_id': '16', 'team_id': '148', 'version': '1471183432594', 'sec': '0', 'last_modified': '2016-08-14T15:03:52', 'y': '0.0', 'x': '0.0', 'outcome': '1', 'id': '2066606636'}
{'min': '0', 'event_id': '2', 'timestamp': '2016-08-14T14:00:25.556', 'type_id': '32', 'period_id': '1', 'team_id': '143', 'version': '1471179625559', 'sec': '0', 'last_modified': '2016-08-14T14:00:26', 'y': '0.0', 'x': '0.0', 'outcome': '1', 'id': '931188097'}

答案 1 :(得分:0)

我找到了解决方案:这些值是 <Event> 元素的属性,而不是子元素。请把下面两组代码中每组的第一行替换为第二行。

temp = record.find(tag.value)

temp = record.attrib[tag.value]

event_data[tag.value].append(temp.text)

event_data[tag.value].append(temp)