我有一段从 API 获取的 xml,内容如下所示。
import xml.etree.ElementTree as ET

import lxml.etree as et
import pandas as pd
import requests
from lxml import etree
# Placeholder endpoint from the question; a real URL must include a scheme
# such as "https://".
url = 'abc.com'
# Without a timeout the request can block indefinitely if the server stalls.
response = requests.get(url, timeout=30)
# Fail early on 4xx/5xx instead of handing an error page to the XML parser.
response.raise_for_status()
# Raw response body as bytes.
xml_data1 = response.content
print(xml_data1)
xml_data1:
<?xml version="1.0" encoding="utf-8"?>
<Leads>
<Lead Id="123" LeadTitle="test, test., , (123) 456-7890, " CreateDate="01/01/2017 11:11:11" ModifyDate="01/04/2017 03:03:03" ACount="1" LCount="4" RCount="0" ROnly="false" Flagged="false" LastDistributionDate="01/01/2017 10:10:10" LeadFormType="test test">
<Campaign CampaignId="123" CampaignTitle="abc" />
<Status StatusId="123" StatusTitle="test" />
<Agent AgentId="123" AgentName="test, test" AgentEmail="a@a.com">
<AgentCustomFields custom1="test test, test" custom2="test" custom3="" custom4="" />
</Agent>
<Fields>
<Field FieldId="7" Value="a@a.com" FieldTitle="test" FieldType="test" />
<Field FieldId="8" Value="test" FieldTitle="test 1" FieldType="test" />
<Field FieldId="9" Value="test" FieldTitle="City" FieldType="Text" />
<Field FieldId="10" Value="test" FieldTitle="State" FieldType="State" />
<Field FieldId="11" Value="test" FieldTitle="test" FieldType="Zip" />
<Field FieldId="950" Value="test." FieldTitle="Business Name" FieldType="Text" />
<Field FieldId="1261" Value="Intuit Desktop" FieldTitle="test" FieldType="Text" />
<Field FieldId="1262" Value="test" FieldTitle="test" FieldType="Text" />
<Field FieldId="1263" Value="test" FieldTitle="test" FieldType="Number" />
<Field FieldId="1267" Value="test" FieldTitle="test" FieldType="Text" />
<Field FieldId="1310" Value="test" FieldTitle="test" FieldType="Phone" />
<Field FieldId="1319" Value="test" FieldTitle="test" FieldType="Number" />
<Field FieldId="1485" Value="test" FieldTitle="tst" FieldType="State" />
</Fields>
<Logs>
<StatusLog>
<Status LogId="123" LogDate="01/04/2017 03:08:44" StatusId="28" StatusTitle="test" AgentId="19" AgentName="test" AgentEmail="test@test.com" />
</StatusLog>
<ActionLog>
<Action LogId="123" ActionTypeId="73" ActionTypeName="test" MilestoneId="1" ActionDate="01/04/2017 03:08:44" ActionNote="test" AgentId="19" AgentName="test,test" AgentEmail="test@test.com" />
</ActionLog>
<EmailLog>
<Email LogId="123" SendDate="01/01/2017 20:53:39" EmailTemplateId="1" EmailTemplateName="test " AgentId="1" AgentName="test" AgentEmail="test@test.com" />
</EmailLog>
<DistributionLog>
<Distribution LogId="1" LogDate="01/01/2017 10:10:08" DistributionProgramId="1" DistributionProgramName="test" AssignedAgentId="1" AssignedAgentName="test,test" AssignedAgentEmail="test@test.com" />
</DistributionLog>
<CreationLog LogId="1" LogDate="01/01/2017 10:10:05" Imported="true" CreatedByAgentId="1" CreatedByAgentName="test, test" CreatedByAgentEmail="test@test.com" />
</Logs>
</Lead>
</Leads>
出于工作上的顾虑,我无法发布完整的 xml 字符串,但它遵循上面的结构。根据 xml 验证程序,这段 xml 是格式正确的;但是当我进行另一个 API 调用时,返回的另一个 xml 字符串看起来像这样:
<?xml version="1.0" encoding="utf-8"?>\r\n<Leads>\r\n <Lead Id="123" />\r\n <Lead Id="456" />\r\n</Leads>'
我可以使用以下代码将上述xml成功传递到数据框:
class XML2DataFrame:
    """Flatten the children of an XML root element into a pandas DataFrame.

    Every direct child of the root becomes one row. The attributes of that
    child and of all of its descendants are merged into one flat dictionary,
    so an attribute name may appear only once within a child's subtree.
    """

    def __init__(self, xml_data):
        """Parse *xml_data* with ``ET.XML`` and keep the resulting root."""
        self.root = ET.XML(xml_data)

    def parse_root(self, root):
        """Return one attribute dictionary per direct child of *root*."""
        return [self.parse_element(child) for child in root]

    def parse_element(self, element, parsed=None):
        """Collect the attributes of *element* and all of its descendants.

        Args:
            element: The XML element to read attributes from.
            parsed: Dictionary to add the attributes to; a new one is
                created when omitted.

        Returns:
            The dictionary mapping attribute names to attribute values.

        Raises:
            ValueError: If an attribute name is already present in *parsed*.
        """
        if parsed is None:
            parsed = {}
        for key, value in element.attrib.items():
            if key in parsed:
                raise ValueError(
                    'duplicate attribute {0} at element {1}'.format(
                        key, self._describe(element)))
            parsed[key] = value
        # Recurse so that descendant attributes land in the same dictionary.
        for child in element:
            self.parse_element(child, parsed)
        return parsed

    @staticmethod
    def _describe(element):
        """Return a label for *element* to use in error messages.

        lxml elements expose ``getroottree()`` and yield a full path; the
        standard-library ElementTree elements do not, so fall back to the
        tag name instead of failing with AttributeError.
        """
        getroottree = getattr(element, 'getroottree', None)
        if getroottree is None:
            return element.tag
        return getroottree().getpath(element)

    def process_data(self):
        """Parse the stored root and return the rows as a DataFrame."""
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)
# Build the converter from the XML payload and flatten it into a DataFrame.
# NOTE(review): `xml_data` is not defined in the snippet shown; presumably it
# is the response body fetched earlier (e.g. `xml_data1`) - confirm.
xml2df = XML2DataFrame(xml_data)
xml_dataframe = xml2df.process_data()
但是,当我将可能格式错误的xml字符串传递给上述函数时,会出现错误:
AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'getroottree'
由于可能格式错误的xml在同一个标记中具有多个值,因此我认为该函数无法对其进行解析。
我希望将可能格式错误的xml推送到平面数据框中。
编辑:期望从 xml 得到的输出列标题:
ActionCount CreateDate Flagged Id LastDistributionDate LeadFormType LeadTitle LogCount FieldId FieldTitle FieldType Value CampaignId CampaignTitle AgentEmail AgentId AgentName LogDate LogId StatusId StatusTitle AssignedAgentEmail AssignedAgentId AssignedAgentName DistributionProgramId DistributionProgramName LogDate LogId
答案 0 :(得分:2)
自从您更新了问题,我决定用新的xml发布另一个答案。
from bs4 import BeautifulSoup
import pandas as pd
# Sample payload copied from the question.
xml = """
<?xml version="1.0" encoding="utf-8"?>
<Leads>
<Lead Id="123" LeadTitle="test, test., , (123) 456-7890, " CreateDate="01/01/2017 11:11:11" ModifyDate="01/04/2017 03:03:03" ACount="1" LCount="4" RCount="0" ROnly="false" Flagged="false" LastDistributionDate="01/01/2017 10:10:10" LeadFormType="test test">
<Campaign CampaignId="123" CampaignTitle="abc" />
<Status StatusId="123" StatusTitle="test" />
<Agent AgentId="123" AgentName="test, test" AgentEmail="a@a.com">
<AgentCustomFields custom1="test test, test" custom2="test" custom3="" custom4="" />
</Agent>
<Fields>
<Field FieldId="7" Value="a@a.com" FieldTitle="test" FieldType="test" />
<Field FieldId="8" Value="test" FieldTitle="test 1" FieldType="test" />
<Field FieldId="9" Value="test" FieldTitle="City" FieldType="Text" />
<Field FieldId="10" Value="test" FieldTitle="State" FieldType="State" />
<Field FieldId="11" Value="test" FieldTitle="test" FieldType="Zip" />
<Field FieldId="950" Value="test." FieldTitle="Business Name" FieldType="Text" />
<Field FieldId="1261" Value="Intuit Desktop" FieldTitle="test" FieldType="Text" />
<Field FieldId="1262" Value="test" FieldTitle="test" FieldType="Text" />
<Field FieldId="1263" Value="test" FieldTitle="test" FieldType="Number" />
<Field FieldId="1267" Value="test" FieldTitle="test" FieldType="Text" />
<Field FieldId="1310" Value="test" FieldTitle="test" FieldType="Phone" />
<Field FieldId="1319" Value="test" FieldTitle="test" FieldType="Number" />
<Field FieldId="1485" Value="test" FieldTitle="tst" FieldType="State" />
</Fields>
<Logs>
<StatusLog>
<Status LogId="123" LogDate="01/04/2017 03:08:44" StatusId="28" StatusTitle="test" AgentId="19" AgentName="test" AgentEmail="test@test.com" />
</StatusLog>
<ActionLog>
<Action LogId="123" ActionTypeId="73" ActionTypeName="test" MilestoneId="1" ActionDate="01/04/2017 03:08:44" ActionNote="test" AgentId="19" AgentName="test,test" AgentEmail="test@test.com" />
</ActionLog>
<EmailLog>
<Email LogId="123" SendDate="01/01/2017 20:53:39" EmailTemplateId="1" EmailTemplateName="test " AgentId="1" AgentName="test" AgentEmail="test@test.com" />
</EmailLog>
<DistributionLog>
<Distribution LogId="1" LogDate="01/01/2017 10:10:08" DistributionProgramId="1" DistributionProgramName="test" AssignedAgentId="1" AssignedAgentName="test,test" AssignedAgentEmail="test@test.com" />
</DistributionLog>
<CreationLog LogId="1" LogDate="01/01/2017 10:10:05" Imported="true" CreatedByAgentId="1" CreatedByAgentName="test, test" CreatedByAgentEmail="test@test.com" />
</Logs>
</Lead>
</Leads>
"""
# Parse the payload with Beautiful Soup's XML parser.
soup = BeautifulSoup(xml, "xml")

# Gather the attribute dictionary of every element in the document.
# Calling the soup object is shorthand for soup.find_all().
attrs = [elm.attrs for elm in soup()]

# Each <Field> element becomes one DataFrame row; the attributes of every
# other element are repeated on each of those rows.
fields_attribute_list = [x for x in attrs if 'FieldId' in x]
other_attribute_list = [x for x in attrs if x and 'FieldId' not in x]

# Merge the non-Field attributes into a single dictionary. setdefault keeps
# the first value seen for a name, so later attributes that share a name
# (e.g. LogId) are dropped.
attribute_dict = {}
for d in other_attribute_list:
    for k, v in d.items():
        attribute_dict.setdefault(k, v)

# Build each row as a new dictionary so the attribute dictionaries owned by
# the parsed tree are not modified. Shared attributes take precedence over
# Field attributes on a name clash, and Field keys keep their leading position.
full_list = [{**field, **attribute_dict} for field in fields_attribute_list]

# Make the DataFrame: one row per Field.
df = pd.DataFrame(full_list)
但是,请注意,此方法会覆盖(即只保留一个值)xml 中名称相同的属性,例如 `LogId`。无论如何,这段代码应该可以帮助您入门。
答案 1 :(得分:1)
我认为您会发现,使用 BeautifulSoup 进行 XML / HTML 解析要容易得多。它还能很好地处理格式错误的 XML 和 HTML。
pip install beautifulsoup4
以下是如何使用 BeautifulSoup 解析您提供的 xml。
from bs4 import BeautifulSoup
import pandas as pd
# Sample payload: one fully populated <Lead> plus two that carry only an Id.
xml = """
<?xml version="1.0" encoding="utf-8"?>
<Leads>
<Lead Id="123" LeadTitle="test, test., , (123) 456-7890, " CreateDate="01/01/2017 11:11:11" ModifyDate="01/04/2017 03:03:03" ACount="1" LCount="4" RCount="0" ROnly="false" Flagged="false" LastDistributionDate="01/01/2017 10:10:10" LeadFormType="test test"></Lead>
<Lead Id="123" />
<Lead Id="456" />
</Leads>
"""
# Parse the payload with Beautiful Soup's XML parser.
soup = BeautifulSoup(xml, "xml")
# find_all is the current name of the legacy findAll alias.
leads = soup.find_all('Lead')
# One row per <Lead>, built from that element's own attributes.
lead_list = [lead.attrs for lead in leads]
df = pd.DataFrame(lead_list)
# Bare expression: displays the frame when run in a notebook/REPL.
df
输出:
ACount CreateDate Flagged Id LCount LastDistributionDate LeadFormType LeadTitle ModifyDate RCount ROnly
0 1 01/01/2017 11:11:11 false 123 4 01/01/2017 10:10:10 test test test, test., , (123) 456-7890, 01/04/2017 03:03:03 0 false
1 NaN NaN NaN 123 NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN 456 NaN NaN NaN NaN NaN NaN NaN