我有一个9GB XML
文件,通过将其加载到内存中来处理它有点大。我可以使用哪些流读取器/写入器选项?
以下是我正在使用的当前代码:
print ("opening file")
with open('text.xml') as fd:
doc = xmltodict.parse(fd.read())
print ("converting to CSV")
columns = ('EntityType','OrganisationName','AddressLine1','AddressLine2','AddressLine3','PostCode','CompanyID','OrganisationType','OrganisationStatus','OrganisationIndustryCode','DirectorRole','DirectorName')
with open('output.csv', 'wb') as f:
writer = csv.DictWriter(f, fieldnames=columns)
writer.writeheader()
for x in doc['N8:EntityList']['N8:Entity']:
writer.writerow({'EntityType':x['@xsi:type'].split(':')[1]
,'OrganisationName':x['N2:OrganisationName']['N2:NameElement']['#text'] if isinstance(x['N2:OrganisationName'],(list,)) != True else x['N2:OrganisationName'][0]['N2:NameElement']['#text']
,'AddressLine1':x['N5:Addresses']['N5:Address'][0]['N6:FreeTextAddress']['N6:AddressLine'][0]['#text']
,'AddressLine2':x['N5:Addresses']['N5:Address'][0]['N6:FreeTextAddress']['N6:AddressLine'][1]['#text']
,'AddressLine3':x['N5:Addresses']['N5:Address'][0]['N6:FreeTextAddress']['N6:AddressLine'][2]['#text'] if len(x['N5:Addresses']['N5:Address'][0]['N6:FreeTextAddress']['N6:AddressLine']) > 2 else None
,'PostCode':x['N5:Addresses']['N5:Address'][0]['N6:PostCode']['N6:Identifier']['#text']
,'CompanyID':x['N5:Identifiers']['N5:Identifier']['N5:IdentifierElement'] if isinstance(x['N5:Identifiers']['N5:Identifier'],(list,)) != True else x['N5:Identifiers']['N5:Identifier'][0]['N5:IdentifierElement']
,'OrganisationType':x['N5:OrganisationInfo']['@N5:Type'] if x['N5:OrganisationInfo'].has_key('@N5:Type') else None
,'OrganisationStatus':x['N5:OrganisationInfo']['@N5:Status'] if x['N5:OrganisationInfo'].has_key('@N5:Status') else None
,'OrganisationIndustryCode':x['N5:OrganisationInfo']['@N5:IndustryCode'] if x['N5:OrganisationInfo'].has_key('@N5:IndustryCode') else None
,'DirectorRole':x['N1:Director']['@xsi:type'].split(':')[1] if isinstance(x['N1:Director'],(list,)) != True else x['N1:Director'][0]['@xsi:type'].split(':')[1]
,'DirectorName':x['N1:Director']['N2:PersonName']['N2:NameElement'][0]['#text'] if isinstance(x['N1:Director'],(list,)) != True else x['N1:Director'][0]['N2:PersonName']['N2:NameElement'][0]['#text'] + " " + x['N1:Director']['N2:PersonName']['N2:NameElement'][1]['#text'] if isinstance(x['N1:Director'],(list,)) != True else x['N1:Director'][0]['N2:PersonName']['N2:NameElement'][1]['#text']
})
xml.etree.cElementTree
是一个选项吗?
XML树非常庞大,但这就是它的样子。我只需要储存一些这些。我知道我需要迭代每一行并有一个计数器或其他东西。是否有任何关于我可以看到的不同XML文档的例子?
N8:Entity
|-- N1:Director: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- N2:PersonName: struct (nullable = true)
| | | |-- N2:NameElement: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- _ElementType: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | | |-- _Type: string (nullable = true)
| | |-- N9:Asic: struct (nullable = true)
| | | |-- N2:OrganisationName: struct (nullable = true)
| | | | |-- N2:NameElement: struct (nullable = true)
| | | | | |-- _ElementType: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | | |-- N5:Identifiers: struct (nullable = true)
| | | | |-- N5:Identifier: struct (nullable = true)
| | | | | |-- N5:IdentifierElement: long (nullable = true)
| | | | | |-- N5:IssuerName: struct (nullable = true)
| | | | | | |-- N2:NameElement: string (nullable = true)
| | | | | |-- _Type: string (nullable = true)
| | | |-- N9:RegisteredOfficeAddress: struct (nullable = true)
| | | | |-- N6:Country: struct (nullable = true)
| | | | | |-- N6:NameElement: struct (nullable = true)
| | | | | | |-- _Abbreviation: boolean (nullable = true)
| | | | | | |-- _NameCode: string (nullable = true)
| | | | | | |-- _VALUE: string (nullable = true)
| | | | |-- N6:FreeTextAddress: struct (nullable = true)
| | | | | |-- N6:AddressLine: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- _Type: string (nullable = true)
| | | | | | | |-- _VALUE: string (nullable = true)
| | | | |-- N6:PostCode: struct (nullable = true)
| | | | | |-- N6:Identifier: struct (nullable = true)
| | | | | | |-- _Type: string (nullable = true)
| | | | | | |-- _VALUE: string (nullable = true)
| | | | |-- _Type: string (nullable = true)
| | | | |-- _Usage: string (nullable = true)
| | |-- N9:Status: string (nullable = true)
| | |-- _DateValidFrom: string (nullable = true)
| | |-- _DateValidTo: string (nullable = true)
| | |-- _type: string (nullable = true)
|-- N2:OrganisationName: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- N2:NameElement: struct (nullable = true)
| | | |-- _ElementType: string (nullable = true)
| | | |-- _VALUE: string (nullable = true)
| | |-- _Type: string (nullable = true)
|-- N5:Addresses: struct (nullable = true)
| |-- N5:Address: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- N6:Country: struct (nullable = true)
| | | | |-- N6:NameElement: struct (nullable = true)
| | | | | |-- _Abbreviation: boolean (nullable = true)
| | | | | |-- _NameCode: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | | |-- N6:FreeTextAddress: struct (nullable = true)
| | | | |-- N6:AddressLine: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- _Type: string (nullable = true)
| | | | | | |-- _VALUE: string (nullable = true)
| | | |-- N6:PostCode: struct (nullable = true)
| | | | |-- N6:Identifier: struct (nullable = true)
| | | | | |-- _Type: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | | |-- _AddressID: long (nullable = true)
| | | |-- _AddressIDType: string (nullable = true)
| | | |-- _DateValidFrom: string (nullable = true)
| | | |-- _DateValidTo: string (nullable = true)
| | | |-- _Type: string (nullable = true)
| | | |-- _Usage: string (nullable = true)
|-- N5:ContactNumbers: struct (nullable = true)
| |-- N5:ContactNumber: struct (nullable = true)
| | |-- N5:ContactNumberElement: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- _Type: string (nullable = true)
| | | | |-- _VALUE: string (nullable = true)
| | |-- _CommunicationMediaType: string (nullable = true)
| | |-- _Usage: string (nullable = true)
|-- N5:ElectronicAddressIdentifiers: struct (nullable = true)
| |-- N5:ElectronicAddressIdentifier: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- _Type: string (nullable = true)
| | | |-- _VALUE: string (nullable = true)
|-- N5:Events: struct (nullable = true)
| |-- N5:Event: struct (nullable = true)
| | |-- _Type: string (nullable = true)
| | |-- _VALUE: string (nullable = true)
|-- N5:Identifiers: struct (nullable = true)
| |-- N5:Identifier: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- N5:IdentifierElement: long (nullable = true)
| | | |-- N5:IssuerName: struct (nullable = true)
| | | | |-- N2:NameElement: string (nullable = true)
| | | |-- _Type: string (nullable = true)
|-- N5:OrganisationInfo: struct (nullable = true)
| |-- _CountryOfOrigin: string (nullable = true)
| |-- _IndustryCode: string (nullable = true)
| |-- _Status: string (nullable = true)
| |-- _Type: string (nullable = true)
| |-- _VALUE: string (nullable = true)
|-- N9:AddressForRecords: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- N6:Country: struct (nullable = true)
| | | |-- N6:NameElement: struct (nullable = true)
| | | | |-- _Abbreviation: boolean (nullable = true)
| | | | |-- _NameCode: string (nullable = true)
| | | | |-- _VALUE: string (nullable = true)
| | |-- N6:FreeTextAddress: struct (nullable = true)
| | | |-- N6:AddressLine: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- _Type: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | |-- N6:PostCode: struct (nullable = true)
| | | |-- N6:Identifier: struct (nullable = true)
| | | | |-- _Type: string (nullable = true)
| | | | |-- _VALUE: string (nullable = true)
| | |-- _DateValidFrom: string (nullable = true)
| | |-- _DateValidTo: string (nullable = true)
|-- N9:AnnualReturnFilingMonth: long (nullable = true)
|-- N9:FinancialReportingFilingMonth: long (nullable = true)
|-- N9:HasConstitutionFiled: boolean (nullable = true)
|-- N9:InsolvencyDetails: struct (nullable = true)
| |-- N9:Appointee: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- N2:OrganisationName: struct (nullable = true)
| | | | |-- N2:NameElement: struct (nullable = true)
| | | | | |-- _ElementType: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | | |-- N2:PersonName: struct (nullable = true)
| | | | |-- N2:NameElement: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- _ElementType: string (nullable = true)
| | | | | | |-- _VALUE: string (nullable = true)
| | | |-- N5:ElectronicAddressIdentifiers: struct (nullable = true)
| | | | |-- N5:ElectronicAddressIdentifier: struct (nullable = true)
| | | | | |-- _Type: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | | |-- N9:PhysicalAddress: struct (nullable = true)
| | | | |-- N6:Country: struct (nullable = true)
| | | | | |-- N6:NameElement: struct (nullable = true)
| | | | | | |-- _Abbreviation: boolean (nullable = true)
| | | | | | |-- _NameCode: string (nullable = true)
| | | | | | |-- _VALUE: string (nullable = true)
| | | | |-- N6:FreeTextAddress: struct (nullable = true)
| | | | | |-- N6:AddressLine: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- _Type: string (nullable = true)
| | | | | | | |-- _VALUE: string (nullable = true)
| | | | |-- N6:PostCode: struct (nullable = true)
| | | | | |-- N6:Identifier: struct (nullable = true)
| | | | | | |-- _Type: string (nullable = true)
| | | | | | |-- _VALUE: long (nullable = true)
| | | | |-- _Type: string (nullable = true)
| | | | |-- _Usage: string (nullable = true)
| | | |-- _DateValidFrom: string (nullable = true)
| | | |-- _DateValidTo: string (nullable = true)
|-- N9:PersonAuthorisedForService: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- N2:PersonName: struct (nullable = true)
| | | |-- N2:NameElement: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- _ElementType: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | | |-- _Type: string (nullable = true)
| | |-- N9:Address: struct (nullable = true)
| | | |-- N6:Country: struct (nullable = true)
| | | | |-- N6:NameElement: struct (nullable = true)
| | | | | |-- _Abbreviation: boolean (nullable = true)
| | | | | |-- _NameCode: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | | |-- N6:FreeTextAddress: struct (nullable = true)
| | | | |-- N6:AddressLine: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- _Type: string (nullable = true)
| | | | | | |-- _VALUE: string (nullable = true)
| | | |-- N6:PostCode: struct (nullable = true)
| | | | |-- N6:Identifier: struct (nullable = true)
| | | | | |-- _Type: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | | |-- _Type: string (nullable = true)
| | | |-- _Usage: string (nullable = true)
| | |-- _DateValidFrom: string (nullable = true)
| | |-- _DateValidTo: string (nullable = true)
|-- N9:PreviousCompanyName: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- _DateValidFrom: string (nullable = true)
| | |-- _DateValidTo: string (nullable = true)
| | |-- _VALUE: string (nullable = true)
|-- N9:PreviousCompanyStatus: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- _DateValidFrom: string (nullable = true)
| | |-- _DateValidTo: string (nullable = true)
| | |-- _VALUE: string (nullable = true)
|-- N9:ShareRegisterAddress: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- N6:Country: struct (nullable = true)
| | | |-- N6:NameElement: struct (nullable = true)
| | | | |-- _Abbreviation: boolean (nullable = true)
| | | | |-- _NameCode: string (nullable = true)
| | | | |-- _VALUE: string (nullable = true)
| | |-- N6:FreeTextAddress: struct (nullable = true)
| | | |-- N6:AddressLine: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- _Type: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | |-- N6:PostCode: struct (nullable = true)
| | | |-- N6:Identifier: struct (nullable = true)
| | | | |-- _Type: string (nullable = true)
| | | | |-- _VALUE: string (nullable = true)
| | |-- _DateValidFrom: string (nullable = true)
| | |-- _DateValidTo: string (nullable = true)
|-- N9:Shareholding: struct (nullable = true)
| |-- N9:ExtensiveShareholding: boolean (nullable = true)
| |-- N9:ShareAllocation: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- N9:Allocation: long (nullable = true)
| | | |-- N9:Shareholder: struct (nullable = true)
| | | | |-- N2:OrganisationName: struct (nullable = true)
| | | | | |-- N2:NameElement: struct (nullable = true)
| | | | | | |-- _ElementType: string (nullable = true)
| | | | | | |-- _VALUE: string (nullable = true)
| | | | |-- N2:PersonName: struct (nullable = true)
| | | | | |-- N2:NameElement: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- _ElementType: string (nullable = true)
| | | | | | | |-- _VALUE: string (nullable = true)
| | | | |-- N5:Identifiers: struct (nullable = true)
| | | | | |-- N5:Identifier: struct (nullable = true)
| | | | | | |-- N5:IdentifierElement: long (nullable = true)
| | | | | | |-- N5:IssuerName: struct (nullable = true)
| | | | | | | |-- N2:NameElement: string (nullable = true)
| | | | | | |-- _Type: string (nullable = true)
| | | | |-- N9:PhysicalAddress: struct (nullable = true)
| | | | | |-- N6:Country: struct (nullable = true)
| | | | | | |-- N6:NameElement: struct (nullable = true)
| | | | | | | |-- _Abbreviation: boolean (nullable = true)
| | | | | | | |-- _NameCode: string (nullable = true)
| | | | | | | |-- _VALUE: string (nullable = true)
| | | | | |-- N6:FreeTextAddress: struct (nullable = true)
| | | | | | |-- N6:AddressLine: array (nullable = true)
| | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | |-- _Type: string (nullable = true)
| | | | | | | | |-- _VALUE: string (nullable = true)
| | | | | |-- N6:PostCode: struct (nullable = true)
| | | | | | |-- N6:Identifier: struct (nullable = true)
| | | | | | | |-- _Type: string (nullable = true)
| | | | | | | |-- _VALUE: string (nullable = true)
| | | | | |-- _Type: string (nullable = true)
| | | | | |-- _Usage: string (nullable = true)
| | | | |-- _DateValidTo: string (nullable = true)
| | | | |-- _Type: string (nullable = true)
| | | | |-- _type: string (nullable = true)
| |-- N9:TotalNumberOfShares: long (nullable = true)
|-- N9:UltimateHoldingCompany: struct (nullable = true)
| |-- N2:OrganisationName: struct (nullable = true)
| | |-- N2:NameElement: struct (nullable = true)
| | | |-- _ElementType: string (nullable = true)
| | | |-- _VALUE: string (nullable = true)
| |-- N5:Identifiers: struct (nullable = true)
| | |-- N5:Identifier: struct (nullable = true)
| | | |-- N5:IdentifierElement: string (nullable = true)
| | | |-- N5:IssuerName: struct (nullable = true)
| | | | |-- N2:NameElement: string (nullable = true)
| | | |-- _Type: string (nullable = true)
| |-- N9:Address: struct (nullable = true)
| | |-- N6:Country: struct (nullable = true)
| | | |-- N6:NameElement: struct (nullable = true)
| | | | |-- _Abbreviation: boolean (nullable = true)
| | | | |-- _NameCode: string (nullable = true)
| | | | |-- _VALUE: string (nullable = true)
| | |-- N6:FreeTextAddress: struct (nullable = true)
| | | |-- N6:AddressLine: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- _Type: string (nullable = true)
| | | | | |-- _VALUE: string (nullable = true)
| | |-- N6:PostCode: struct (nullable = true)
| | | |-- N6:Identifier: struct (nullable = true)
| | | | |-- _Type: string (nullable = true)
| | | | |-- _VALUE: string (nullable = true)
| | |-- _AddressID: long (nullable = true)
| | |-- _AddressIDType: string (nullable = true)
| | |-- _Type: string (nullable = true)
| | |-- _Usage: string (nullable = true)
| |-- N9:CountryOfOrigin: string (nullable = true)
| |-- _PartyID: long (nullable = true)
| |-- _PartyIDType: string (nullable = true)
| |-- _Type: string (nullable = true)
| |-- _VALUE: string (nullable = true)
|-- _N8: string (nullable = true)
|-- _PartyID: long (nullable = true)
|-- _PartyIDType: string (nullable = true)
|-- _type: string (nullable = true)
|-- _xmlns: string (nullable = true)
|-- _xsi: string (nullable = true)