使用Python流式读取器/写入器选项

时间:2018-04-10 23:08:15

标签: python xml python-2.7

我有一个 9GB 的 XML 文件,它太大了,不适合整个加载到内存中再处理。有哪些流式读取器/写入器方案可以使用?

以下是我正在使用的当前代码:

# Column order of the generated CSV file.
columns = (
    'EntityType',
    'OrganisationName',
    'AddressLine1',
    'AddressLine2',
    'AddressLine3',
    'PostCode',
    'CompanyID',
    'OrganisationType',
    'OrganisationStatus',
    'OrganisationIndustryCode',
    'DirectorRole',
    'DirectorName',
)


def _as_list(node):
    """Return *node* unchanged if it is a list, otherwise wrap it in a list.

    xmltodict yields a list only when an element is repeated; a single
    occurrence comes back as a plain mapping. Normalising here lets the
    caller index with ``[0]`` in both cases.
    """
    return node if isinstance(node, list) else [node]


def _line_text(lines, index):
    """Return the '#text' of address line *index*, or None if it is absent."""
    return lines[index]['#text'] if len(lines) > index else None


def entity_to_row(x):
    """Flatten one parsed ``N8:Entity`` mapping into a CSV row dict.

    Only the first organisation name, address, identifier and director are
    used when an entity carries several of them.

    Args:
        x: the mapping xmltodict produced for a single ``N8:Entity`` element.

    Returns:
        A dict with one key per entry in ``columns``. Optional values
        (address lines beyond those present, organisation attributes)
        are None when missing.

    Raises:
        KeyError: if a mandatory element or attribute is missing.
        IndexError: if the director's name has fewer than two elements.
    """
    organisation = _as_list(x['N2:OrganisationName'])[0]
    address = _as_list(x['N5:Addresses']['N5:Address'])[0]
    address_lines = _as_list(address['N6:FreeTextAddress']['N6:AddressLine'])
    identifier = _as_list(x['N5:Identifiers']['N5:Identifier'])[0]
    info = x['N5:OrganisationInfo']
    director = _as_list(x['N1:Director'])[0]
    name_parts = director['N2:PersonName']['N2:NameElement']

    return {
        'EntityType': x['@xsi:type'].split(':')[1],
        'OrganisationName': organisation['N2:NameElement']['#text'],
        'AddressLine1': _line_text(address_lines, 0),
        'AddressLine2': _line_text(address_lines, 1),
        'AddressLine3': _line_text(address_lines, 2),
        'PostCode': address['N6:PostCode']['N6:Identifier']['#text'],
        'CompanyID': identifier['N5:IdentifierElement'],
        # .get() returns None for a missing attribute, like the old
        # has_key() checks, but also exists on Python 3 mappings.
        'OrganisationType': info.get('@N5:Type'),
        'OrganisationStatus': info.get('@N5:Status'),
        'OrganisationIndustryCode': info.get('@N5:IndustryCode'),
        'DirectorRole': director['@xsi:type'].split(':')[1],
        # Both name elements are joined explicitly. The previous chained
        # conditional expression bound looser than "+", so only one of the
        # two name parts ever reached the output.
        'DirectorName': name_parts[0]['#text'] + " " + name_parts[1]['#text'],
    }


def main():
    """Parse text.xml with xmltodict and write the selected fields to output.csv."""
    print("opening file")
    with open('text.xml') as fd:
        # NOTE: fd.read() pulls the entire file into memory before parsing.
        doc = xmltodict.parse(fd.read())

    print("converting to CSV")
    # Binary mode is what the Python 2 csv module expects for output files.
    with open('output.csv', 'wb') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()

        for x in _as_list(doc['N8:EntityList']['N8:Entity']):
            writer.writerow(entity_to_row(x))


if __name__ == "__main__":
    main()

xml.etree.cElementTree是一个选项吗?

XML 树非常庞大,下面是它的结构。我只需要保存其中的一部分字段。我知道我需要逐行迭代,并配合计数器之类的东西。有没有针对其他 XML 文档的示例可供我参考?

N8:Entity
 |-- N1:Director: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- N2:PersonName: struct (nullable = true)
 |    |    |    |-- N2:NameElement: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- _ElementType: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- _Type: string (nullable = true)
 |    |    |-- N9:Asic: struct (nullable = true)
 |    |    |    |-- N2:OrganisationName: struct (nullable = true)
 |    |    |    |    |-- N2:NameElement: struct (nullable = true)
 |    |    |    |    |    |-- _ElementType: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- N5:Identifiers: struct (nullable = true)
 |    |    |    |    |-- N5:Identifier: struct (nullable = true)
 |    |    |    |    |    |-- N5:IdentifierElement: long (nullable = true)
 |    |    |    |    |    |-- N5:IssuerName: struct (nullable = true)
 |    |    |    |    |    |    |-- N2:NameElement: string (nullable = true)
 |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |-- N9:RegisteredOfficeAddress: struct (nullable = true)
 |    |    |    |    |-- N6:Country: struct (nullable = true)
 |    |    |    |    |    |-- N6:NameElement: struct (nullable = true)
 |    |    |    |    |    |    |-- _Abbreviation: boolean (nullable = true)
 |    |    |    |    |    |    |-- _NameCode: string (nullable = true)
 |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- N6:FreeTextAddress: struct (nullable = true)
 |    |    |    |    |    |-- N6:AddressLine: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- N6:PostCode: struct (nullable = true)
 |    |    |    |    |    |-- N6:Identifier: struct (nullable = true)
 |    |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |-- _Usage: string (nullable = true)
 |    |    |-- N9:Status: string (nullable = true)
 |    |    |-- _DateValidFrom: string (nullable = true)
 |    |    |-- _DateValidTo: string (nullable = true)
 |    |    |-- _type: string (nullable = true)
 |-- N2:OrganisationName: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- N2:NameElement: struct (nullable = true)
 |    |    |    |-- _ElementType: string (nullable = true)
 |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _Type: string (nullable = true)
 |-- N5:Addresses: struct (nullable = true)
 |    |-- N5:Address: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- N6:Country: struct (nullable = true)
 |    |    |    |    |-- N6:NameElement: struct (nullable = true)
 |    |    |    |    |    |-- _Abbreviation: boolean (nullable = true)
 |    |    |    |    |    |-- _NameCode: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- N6:FreeTextAddress: struct (nullable = true)
 |    |    |    |    |-- N6:AddressLine: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- N6:PostCode: struct (nullable = true)
 |    |    |    |    |-- N6:Identifier: struct (nullable = true)
 |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- _AddressID: long (nullable = true)
 |    |    |    |-- _AddressIDType: string (nullable = true)
 |    |    |    |-- _DateValidFrom: string (nullable = true)
 |    |    |    |-- _DateValidTo: string (nullable = true)
 |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |-- _Usage: string (nullable = true)
 |-- N5:ContactNumbers: struct (nullable = true)
 |    |-- N5:ContactNumber: struct (nullable = true)
 |    |    |-- N5:ContactNumberElement: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _CommunicationMediaType: string (nullable = true)
 |    |    |-- _Usage: string (nullable = true)
 |-- N5:ElectronicAddressIdentifiers: struct (nullable = true)
 |    |-- N5:ElectronicAddressIdentifier: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |-- _VALUE: string (nullable = true)
 |-- N5:Events: struct (nullable = true)
 |    |-- N5:Event: struct (nullable = true)
 |    |    |-- _Type: string (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |-- N5:Identifiers: struct (nullable = true)
 |    |-- N5:Identifier: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- N5:IdentifierElement: long (nullable = true)
 |    |    |    |-- N5:IssuerName: struct (nullable = true)
 |    |    |    |    |-- N2:NameElement: string (nullable = true)
 |    |    |    |-- _Type: string (nullable = true)
 |-- N5:OrganisationInfo: struct (nullable = true)
 |    |-- _CountryOfOrigin: string (nullable = true)
 |    |-- _IndustryCode: string (nullable = true)
 |    |-- _Status: string (nullable = true)
 |    |-- _Type: string (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |-- N9:AddressForRecords: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- N6:Country: struct (nullable = true)
 |    |    |    |-- N6:NameElement: struct (nullable = true)
 |    |    |    |    |-- _Abbreviation: boolean (nullable = true)
 |    |    |    |    |-- _NameCode: string (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- N6:FreeTextAddress: struct (nullable = true)
 |    |    |    |-- N6:AddressLine: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- N6:PostCode: struct (nullable = true)
 |    |    |    |-- N6:Identifier: struct (nullable = true)
 |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _DateValidFrom: string (nullable = true)
 |    |    |-- _DateValidTo: string (nullable = true)
 |-- N9:AnnualReturnFilingMonth: long (nullable = true)
 |-- N9:FinancialReportingFilingMonth: long (nullable = true)
 |-- N9:HasConstitutionFiled: boolean (nullable = true)
 |-- N9:InsolvencyDetails: struct (nullable = true)
 |    |-- N9:Appointee: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- N2:OrganisationName: struct (nullable = true)
 |    |    |    |    |-- N2:NameElement: struct (nullable = true)
 |    |    |    |    |    |-- _ElementType: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- N2:PersonName: struct (nullable = true)
 |    |    |    |    |-- N2:NameElement: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- _ElementType: string (nullable = true)
 |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- N5:ElectronicAddressIdentifiers: struct (nullable = true)
 |    |    |    |    |-- N5:ElectronicAddressIdentifier: struct (nullable = true)
 |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- N9:PhysicalAddress: struct (nullable = true)
 |    |    |    |    |-- N6:Country: struct (nullable = true)
 |    |    |    |    |    |-- N6:NameElement: struct (nullable = true)
 |    |    |    |    |    |    |-- _Abbreviation: boolean (nullable = true)
 |    |    |    |    |    |    |-- _NameCode: string (nullable = true)
 |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- N6:FreeTextAddress: struct (nullable = true)
 |    |    |    |    |    |-- N6:AddressLine: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- N6:PostCode: struct (nullable = true)
 |    |    |    |    |    |-- N6:Identifier: struct (nullable = true)
 |    |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |    |-- _VALUE: long (nullable = true)
 |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |-- _Usage: string (nullable = true)
 |    |    |    |-- _DateValidFrom: string (nullable = true)
 |    |    |    |-- _DateValidTo: string (nullable = true)
 |-- N9:PersonAuthorisedForService: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- N2:PersonName: struct (nullable = true)
 |    |    |    |-- N2:NameElement: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- _ElementType: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- _Type: string (nullable = true)
 |    |    |-- N9:Address: struct (nullable = true)
 |    |    |    |-- N6:Country: struct (nullable = true)
 |    |    |    |    |-- N6:NameElement: struct (nullable = true)
 |    |    |    |    |    |-- _Abbreviation: boolean (nullable = true)
 |    |    |    |    |    |-- _NameCode: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- N6:FreeTextAddress: struct (nullable = true)
 |    |    |    |    |-- N6:AddressLine: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- N6:PostCode: struct (nullable = true)
 |    |    |    |    |-- N6:Identifier: struct (nullable = true)
 |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |-- _Usage: string (nullable = true)
 |    |    |-- _DateValidFrom: string (nullable = true)
 |    |    |-- _DateValidTo: string (nullable = true)
 |-- N9:PreviousCompanyName: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _DateValidFrom: string (nullable = true)
 |    |    |-- _DateValidTo: string (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |-- N9:PreviousCompanyStatus: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _DateValidFrom: string (nullable = true)
 |    |    |-- _DateValidTo: string (nullable = true)
 |    |    |-- _VALUE: string (nullable = true)
 |-- N9:ShareRegisterAddress: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- N6:Country: struct (nullable = true)
 |    |    |    |-- N6:NameElement: struct (nullable = true)
 |    |    |    |    |-- _Abbreviation: boolean (nullable = true)
 |    |    |    |    |-- _NameCode: string (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- N6:FreeTextAddress: struct (nullable = true)
 |    |    |    |-- N6:AddressLine: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- N6:PostCode: struct (nullable = true)
 |    |    |    |-- N6:Identifier: struct (nullable = true)
 |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _DateValidFrom: string (nullable = true)
 |    |    |-- _DateValidTo: string (nullable = true)
 |-- N9:Shareholding: struct (nullable = true)
 |    |-- N9:ExtensiveShareholding: boolean (nullable = true)
 |    |-- N9:ShareAllocation: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- N9:Allocation: long (nullable = true)
 |    |    |    |-- N9:Shareholder: struct (nullable = true)
 |    |    |    |    |-- N2:OrganisationName: struct (nullable = true)
 |    |    |    |    |    |-- N2:NameElement: struct (nullable = true)
 |    |    |    |    |    |    |-- _ElementType: string (nullable = true)
 |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- N2:PersonName: struct (nullable = true)
 |    |    |    |    |    |-- N2:NameElement: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- _ElementType: string (nullable = true)
 |    |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |-- N5:Identifiers: struct (nullable = true)
 |    |    |    |    |    |-- N5:Identifier: struct (nullable = true)
 |    |    |    |    |    |    |-- N5:IdentifierElement: long (nullable = true)
 |    |    |    |    |    |    |-- N5:IssuerName: struct (nullable = true)
 |    |    |    |    |    |    |    |-- N2:NameElement: string (nullable = true)
 |    |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |-- N9:PhysicalAddress: struct (nullable = true)
 |    |    |    |    |    |-- N6:Country: struct (nullable = true)
 |    |    |    |    |    |    |-- N6:NameElement: struct (nullable = true)
 |    |    |    |    |    |    |    |-- _Abbreviation: boolean (nullable = true)
 |    |    |    |    |    |    |    |-- _NameCode: string (nullable = true)
 |    |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |    |-- N6:FreeTextAddress: struct (nullable = true)
 |    |    |    |    |    |    |-- N6:AddressLine: array (nullable = true)
 |    |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |    |-- N6:PostCode: struct (nullable = true)
 |    |    |    |    |    |    |-- N6:Identifier: struct (nullable = true)
 |    |    |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |-- _Usage: string (nullable = true)
 |    |    |    |    |-- _DateValidTo: string (nullable = true)
 |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |-- _type: string (nullable = true)
 |    |-- N9:TotalNumberOfShares: long (nullable = true)
 |-- N9:UltimateHoldingCompany: struct (nullable = true)
 |    |-- N2:OrganisationName: struct (nullable = true)
 |    |    |-- N2:NameElement: struct (nullable = true)
 |    |    |    |-- _ElementType: string (nullable = true)
 |    |    |    |-- _VALUE: string (nullable = true)
 |    |-- N5:Identifiers: struct (nullable = true)
 |    |    |-- N5:Identifier: struct (nullable = true)
 |    |    |    |-- N5:IdentifierElement: string (nullable = true)
 |    |    |    |-- N5:IssuerName: struct (nullable = true)
 |    |    |    |    |-- N2:NameElement: string (nullable = true)
 |    |    |    |-- _Type: string (nullable = true)
 |    |-- N9:Address: struct (nullable = true)
 |    |    |-- N6:Country: struct (nullable = true)
 |    |    |    |-- N6:NameElement: struct (nullable = true)
 |    |    |    |    |-- _Abbreviation: boolean (nullable = true)
 |    |    |    |    |-- _NameCode: string (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- N6:FreeTextAddress: struct (nullable = true)
 |    |    |    |-- N6:AddressLine: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- N6:PostCode: struct (nullable = true)
 |    |    |    |-- N6:Identifier: struct (nullable = true)
 |    |    |    |    |-- _Type: string (nullable = true)
 |    |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _AddressID: long (nullable = true)
 |    |    |-- _AddressIDType: string (nullable = true)
 |    |    |-- _Type: string (nullable = true)
 |    |    |-- _Usage: string (nullable = true)
 |    |-- N9:CountryOfOrigin: string (nullable = true)
 |    |-- _PartyID: long (nullable = true)
 |    |-- _PartyIDType: string (nullable = true)
 |    |-- _Type: string (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |-- _N8: string (nullable = true)
 |-- _PartyID: long (nullable = true)
 |-- _PartyIDType: string (nullable = true)
 |-- _type: string (nullable = true)
 |-- _xmlns: string (nullable = true)
 |-- _xsi: string (nullable = true)

0 个答案:

没有答案