我将成千上万的XML文件解析成字典,并将其结构存储在JSON中。
它们具有几乎相同的结构,但是存在未知数量的不同标签命名方案。在这成千上万个文件中,标签的命名存在各种不同的缩写。
我需要找出描述每项信息的标签有多少种写法,以便正确解析所有信息。
为此,我想创建一个汇总所有 XML/字典的主字典,其中包含标签名称的所有变体,最好还能统计每个标签在这成千上万个 XML/字典中出现的次数。
下面是其中一个字典的一小部分:
{
"Header": {
"Ts": {},
"PeriodEndDt": {},
"PreparedBy": {
"PreparerID": {},
"PreparerFirmName": {
"BusinessNameLine1Txt": {}
},
"PreparerAddress": {
"AddLn1Txt": {},
"CityName": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
}
},
"FormTypeCd": {},
"PeriodBeginDt": {},
"Filer": {
"UniqueID": {},
"BusinessName": {
"BusinessNameLine1Txt": {}
},
"BusinessNameControlTxt": {},
"PhoneNum": {},
"USAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
}
},
"FormData": {
"FormCodeType": {
"BizType": {},
"AssetsAtEOY": {},
"AccountingMethod": {},
"RevenueAndExpenses": {
"ScheduleBNotReqd": {},
"DivsRevAndExpenses": {},
"DivsNetInvstIncomeAmt": {},
"NetGainSaleAstRevAndExpnssAmt": {},
"RevsOvrExpenses": {},
"NetInvestmentIncomeAmt": {}
},
"BalanceSheetGroup": {
"CashInvstBOYAmt": {},
"CashInvstEOYAmt": {},
"CashInvstEOYFMVAmt": {},
"OtherInvestmentsBOYAmt": {},
"OtherInvestmentsEOYAmt": {},
"CapitalStockEOYAmt": {},
"TotalLiabilitiesNetAstEOYAmt": {}
},
"ChangeNetAssetsFundGroup": {
"NetAssettFundBalancesBOYAmt": {},
"ExcessRevExpensesAmt": {},
"OtherIncreasesAmt": {},
"SubtotalAmt": {},
"OtherDecreasesAmt": {},
"TotNetAstOrFundBalancesEOYAmt": {}
},
"CapGainsLossTxInvstIncmDetail": {
"CapGainsLossTxInvstIncmGrp": {
"PropertyDesc": {},
"HowAcquiredCd": {},
"GrossSalesPriceAmt": {},
"GainOrLossAmt": {},
"GainsMinusExcessOrLossesAmt": {}
},
"StatementsRegardingActyGrp": {
"LegislativePoliticalActyInd": {},
"MoreThan100SpentInd": {}
},
"PhoneNum": {},
"LocationOfBooksUSAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
},
"CorporateDirectorsGrp": {
"DirectorsGrp": {
"PersonNm": {},
"USAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
},
"EmpPrograms": {
"EmployeeBenefitGroupNum": {},
"GroupType": {
"GroupElement": {},
"GroupCharacter": {
"GroupNames": {}
}
}
},
"EmpOffice1": {},
"EmpOffice2": {},
"EmpOffice3": {},
"EmpOffice4": {}
}
}
}
}
}
}
}
我最初用来创建字典/JSON 的代码如下:
import xml.etree.ElementTree as ET
def strip_ns(xx):
    """Return tag *xx* without its leading ``{namespace}`` prefix.

    The argument is converted with ``str()`` and split at the first ``}``.
    A tag that carries no namespace prefix is returned unchanged instead of
    raising ``IndexError``.
    """
    # [-1] selects the part after the first '}' when there is one, and the
    # whole string when there is none.
    return str(xx).split('}', 1)[-1]
# Number of element levels below the root that are recorded in ``tierdict``.
MAX_TIERS = 4


def _tag_tree(element, depth):
    """Return a nested dict of the namespace-stripped tags below *element*.

    Descends at most *depth* levels. Every recorded tag maps to a dict of its
    own children; tags on the last recorded level map to ``{}``.
    """
    subtree = {}
    if depth <= 0:
        return subtree
    for child in element:
        # Plain assignment: when a sibling tag repeats, the entry built for
        # the later sibling replaces the one built for the earlier sibling.
        subtree[strip_ns(child.tag)] = _tag_tree(child, depth - 1)
    return subtree


tree = ET.parse('xmlpath.xml')
root = tree.getroot()
tierdict = _tag_tree(root, MAX_TIERS)
我想看到的输出是这样的:
{
"Header": {
"Header.Count": 5672,
"Ts": {
"Ts.Count": 3365
},
"Ss": {
"Ss.Count": 2328
},
答案 0 :(得分:1)
我会定义如下函数,对您需要的元素进行递归搜索:
def get_elements(json_entry, child_elements=None):
    """Tally the tags of ``child_elements`` and their descendants into ``json_entry``.

    Every tag encountered becomes a key of ``json_entry``. Its value is a dict
    holding a ``"Count"`` entry (how many elements with that tag were seen at
    that position) plus one nested record, built the same way, per distinct
    child tag. Existing records are updated in place rather than rebuilt, so
    the same ``json_entry`` can be passed again for further elements and the
    counts keep accumulating.

    Args:
        json_entry: dict to update; it is mutated in place.
        child_elements: iterable of element objects that expose ``.tag`` and
            can be iterated to obtain their children. ``None`` (the default)
            leaves ``json_entry`` untouched.

    Returns:
        The same ``json_entry`` object that was passed in.
    """
    if child_elements is None:
        return json_entry
    for el in child_elements:
        if not isinstance(el.tag, str):
            # Comment / processing-instruction nodes in ElementTree and lxml
            # carry a factory function as their tag, not a name; skip them.
            continue
        rec = json_entry.setdefault(el.tag, {"Count": 0})
        # NOTE(review): an element literally tagged "Count" would collide
        # with this counter key in its parent's record.
        rec["Count"] = rec.get("Count", 0) + 1
        # list(el) gives the direct children without the deprecated
        # getchildren() call.
        get_elements(rec, list(el))
    return json_entry
这样,您只需传入 XML 根元素的子元素即可:
from lxml import etree
# Binary mode hands the raw bytes to the parser, so the file is not decoded
# with the locale's default encoding before the XML parser sees it.
with open("myxml.xml", "rb") as fh:
    tree = etree.parse(fh)
root = tree.getroot()
# list(element) yields the direct children; getchildren() is deprecated.
root_children = list(root)
child_recs = get_elements({}, root_children)
{'tagOne': {'Count': 1}, 'tagTwo': {'Count': 1, 'tagThree': {'Count': 1}, 'tagFour': {'Count': 1, 'tagFive': {'Count': 1}}}}
如果想把根元素也包在最外层,可以这样写:
# Wrap the children's records in a record for the root element itself,
# counted once; the "Count" key goes in first, then the child records.
root_record = {"Count": 1}
root_record.update(child_recs)
master_lookup = {root.tag: root_record}
可以很容易地把它扩展成遍历多个文件的 for 循环:
import os

# Accumulates the tag records of every file found under ``path``.
master_lookup = {}
# os.walk yields (dirpath, dirnames, filenames) triples, so each file name
# has to be joined onto its directory before it can be opened.
# NOTE(review): every file found is handed to the XML parser; presumably
# ``path`` contains only XML files — confirm, or filter by extension.
for dirpath, _dirnames, filenames in os.walk(path):
    for filename in filenames:
        file_path = os.path.join(dirpath, filename)
        with open(file_path, "rb") as fh:
            tree = etree.parse(fh)
        root = tree.getroot()
        root_entry = master_lookup.get(root.tag, {"Count": 0})
        # list(element) yields the direct children; getchildren() is deprecated.
        root_children = list(root)
        # One more document with this root tag has been seen.
        root_count = root_entry.pop("Count", 0) + 1
        master_lookup[root.tag] = {"Count": root_count, **get_elements({**root_entry}, root_children)}
大致就是这样的效果。