我需要将 XML 文件转换为 pandas 的 DataFrame。我已经尝试了多种方法,但结果都一样:None、None……是我哪里做错了吗?有没有更合适的库?会不会是我的 XML 格式导致的?XML 文件的内容如下:
<Document xmlns="xxx/zzz/yyy">
<Header>
<DocumentName>GXXXXXXXXXX</DocumentName>
<DocumentType>G10</DocumentType>
<Version>2.0.0.0</Version>
<Created>2018-12-11T09:00:02.987777+00:00</Created>
<TargetProcessingDate>2019-02-11</TargetProcessingDate>
<Part>
<CurrentPage>1</CurrentPage>
<TotalPages>1</TotalPages>
</Part>
</Header>
<Body>
<Accounts>
<Account>
<Type>20WE</Type>
<OldType>19WE</OldType>
<Kids>
<Kid>
<Name>marc</Name>
<BirthDate>2000-02-06</BirthDate>
<Year>19</Year>
<Email>marc@xxx.com</Email>
</Kid>
</Kids>
</Account>
</Accounts>
</Body>
</Document>
import xml.etree.ElementTree as ET
import pandas as pd
class XML2DataFrame:
    """Flatten the children of an XML root element into a pandas DataFrame.

    Every direct child of the root becomes one row. The attributes and the
    text of that child and of all its descendants become the row's columns.
    """

    def __init__(self, xml_data):
        """Parse *xml_data* (an XML document as ``str`` or ``bytes``)."""
        self.root = ET.XML(xml_data)

    @staticmethod
    def _local_name(tag):
        """Return *tag* without its ``{namespace}`` prefix, if it has one."""
        # ElementTree spells namespaced tags as '{uri}local'; only the local
        # part is wanted as a column name.
        return tag.rpartition('}')[2]

    def parse_root(self, root):
        """Return one dictionary per direct child of *root*.

        Each dictionary holds the attributes and text collected from that
        child and all of its descendants (see ``parse_element``).
        """
        # Iterating the element directly replaces Element.getchildren(),
        # which was removed in Python 3.9.
        return [self.parse_element(child) for child in root]

    def parse_element(self, element, parsed=None):
        """Collect {attribute: value} and {tag: text} into a flat dictionary.

        Args:
            element: the XML element to read.
            parsed: dictionary to extend; a new one is created when omitted.

        Returns:
            The dictionary, filled from *element* and all its descendants.
            Tag names are stored without their namespace prefix, and
            whitespace-only text (typical for container elements) is
            skipped. When a tag occurs more than once, the last text wins.

        Raises:
            ValueError: if an attribute name is already present in *parsed*.
        """
        if parsed is None:
            parsed = {}

        for key, value in element.attrib.items():
            if key in parsed:
                raise ValueError(
                    'duplicate attribute {0} at element {1}'.format(
                        key, element.tag
                    )
                )
            parsed[key] = value

        # Text is collected for every element, not only for those that carry
        # attributes.
        text = element.text.strip() if element.text else ''
        if text:
            parsed[self._local_name(element.tag)] = text

        # Recurse so that descendants contribute to the same flat dictionary.
        for child in element:
            self.parse_element(child, parsed)

        return parsed

    def process_data(self):
        """Parse the root's children and return them as a DataFrame."""
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)
# Build the converter from the XML document held in xml_data
# (xml_data is not defined in this snippet; it is passed to ET.XML by the constructor).
xml2df = XML2DataFrame(xml_data)
# Parse the document and keep the resulting DataFrame.
xml_dataframe = xml2df.process_data()
Type OldType Name BirthDate Year Email
20WE 19WE marc 2000-02-06 19 marc@xxx.com
答案 0 :(得分:1)
您可以使用 xmltodict 将 XML 转换为字典(类似 JSON 的结构),然后再进行解析:
import xmltodict
import pandas as pd
# Read the file as bytes so the XML parser honours the encoding declared in
# the document instead of the platform's default text encoding.
with open('file.xml', 'rb') as f:
    # force_list makes 'Account' and 'Kid' lists even when only one of each
    # is present, so the loops below work for one record or many.
    document = xmltodict.parse(f.read(), force_list=('Account', 'Kid'))

data = document['Document']['Body']['Accounts']['Account']

# One output row per (account, kid) pair, stored column-wise for pandas.
data_pd = {
    'Type': [],
    'OldType': [],
    'Name': [],
    'BirthDate': [],
    'Year': [],
    'Email': [],
}
for account in data:
    # An empty <Kids/> element is parsed as None, hence the `or {}`.
    kids = (account.get('Kids') or {}).get('Kid') or []
    for kid in kids:
        data_pd['Type'].append(account.get('Type'))
        data_pd['OldType'].append(account.get('OldType'))
        data_pd['Name'].append(kid.get('Name'))
        data_pd['BirthDate'].append(kid.get('BirthDate'))
        data_pd['Year'].append(kid.get('Year'))
        data_pd['Email'].append(kid.get('Email'))

df = pd.DataFrame(data_pd)
print(df)
输出:
Type OldType Name BirthDate Year Email
0 20WE 19WE marc 2000-02-06 19 marc@xxx.com
答案 1 :(得分:0)
// Curried retry-strategy factory: the outer call takes an options object
// (defaults: maxRetryAttempts = 3, scalingDuration = 1000, excludedStatusCodes = [])
// and returns a function that pipes the `attempts` observable through
// mergeMap and finalize.
// NOTE(review): the leading "像" token and the missing declaration keyword suggest
// this fragment was fused with surrounding text — confirm against the original file.
像 genericRetryStrategy = ({
maxRetryAttempts = 3,
scalingDuration = 1000,
excludedStatusCodes = [],
}: {
maxRetryAttempts?: number;
scalingDuration?: number;
excludedStatusCodes?: HttpStatusCode[];
} = {}) => (attempts: Observable<any>) => {
return attempts.pipe(
mergeMap((error, i) => {
// The retry decision below is passed to showNotification as a callback;
// when (or whether) that callback runs is not visible from this fragment.
this.notificationService.showNotification('attempting', 'retry', 4000, () => {
// Post-increment: retryAttempt receives the value of i before it is incremented.
const retryAttempt = i++;
// if maximum number of retries have been met
// or response is a status code we don't wish the retry, throw error
if (retryAttempt > maxRetryAttempts || excludedStatusCodes.find(e => e === error.status)) {
return throwError(error);
}
console.log(`Attempt ${retryAttempt}: retrying in ${retryAttempt * scalingDuration}ms`);
//retry after 1s, 2s, etc...
return timer(retryAttempt * scalingDuration);
});
// NOTE(review): this return runs for every emitted error regardless of the
// callback above; the throwError/timer values returned inside the callback go to
// showNotification, not to mergeMap — presumably not intended, confirm.
return throwError(error);
}),
// Logs once the piped observable terminates.
finalize(() => console.log('We are done!')),
);
};
(throwError(error))
输出
BeautifulSoup