我有两个非常大的 XML 文件,它们为同一组地点/建筑物/房间组合保存不同的数据。目前我先用 Python 的 etree parse 解析第一个大文件,然后循环遍历它,提取地点/建筑物/房间 ID(以及其他信息),再用这些 ID 去遍历第二个大型 XML 文件(结构与第一个相同)。对第二个文件,我使用 lxml 的 iterparse 来查找并提取与第一个文件中特定地点对应的 Place 元素,然后遍历该 Place 元素以找到相关数据。这样做可以工作,但随着对第一个文件的循环不断深入,它变得越来越慢。
在对第二个大文件做 iterparse 时,我已经尽我所能对不相关的元素调用 clear() 进行清理,但我有 5000 个地点需要循环:前 100 个处理得非常快(不到一分钟),接下来的 400 个需要 30 分钟,依此类推。15 个小时后,我大约处理到第 4000 个设施(地点),速度非常慢。我怀疑是解析其中一个文件时数据量太大了。
这是使用通用化xml的简化代码(抱歉,我无法进一步简化)。
# Paths of the two large XML inputs.
# NOTE(review): the second name is spelled "largeFile2.xml" (capital F) while the
# first is all lower-case -- confirm this matches the files on disk.
largefile1 = "largefile1.xml"
largefile2 = "largeFile2.xml"
# The first file is parsed completely into memory; proot is its root element.
# NOTE(review): the import that binds ET is not shown; the tag= keyword passed to
# ET.iterparse elsewhere in this script suggests lxml.etree -- confirm.
ptree = ET.parse (largefile1)
proot = ptree.getroot()
# Output file that the main loop wraps in csv.writer; it is closed by o.close()
# at the end of the script.
# NOTE(review): on Python 3 the csv module expects files opened with newline='' -- confirm
# the target Python version.
o = open('output.txt', 'w')
def get_place_elem(pplaceid, largefile2):
    """Return the first ``<Place>`` element whose PlaceIdentifier equals *pplaceid*.

    The file is streamed with ``ET.iterparse`` restricted to ``Place`` end
    events.  Every call starts a fresh pass at the beginning of *largefile2*,
    so the cost of one lookup grows with the position of the wanted place
    in the file.

    Args:
        pplaceid: Identifier text to look for.
        largefile2: Path (or file object) of the XML file to scan.

    Returns:
        The matching ``Place`` element, still attached to its parsed tree,
        or ``None`` when no place matches.
    """
    # Fix: the original passed ``Largefile2`` (capital L), which is not the
    # parameter name and raised NameError.
    places = ET.iterparse(largefile2, events=("end",), tag='Place')
    for _event, place in places:
        for place_ident in place.findall('PlaceIdentification'):
            # findtext() yields None instead of raising when the
            # PlaceIdentifier child is absent.
            if place_ident.findtext('PlaceIdentifier') == pplaceid:
                return place
        # Not the place we are after: empty it and drop the already-visited
        # siblings in front of it so the partial tree does not keep growing.
        place.clear()
        while place.getprevious() is not None:
            del place.getparent()[0]
    return None
def getfacdata(pplaceid, pbuildid, proomid, Place):
    """Locate one room inside *Place* and return the data collected from it.

    Args:
        pplaceid: Place identifier (not used by the visible part of the body).
        pbuildid: Text of the building ``Identifier`` to match.
        proomid: Text of the room ``Identifier`` to match.
        Place: ``Place`` element to search.

    Returns:
        The ``data`` list built by the elided room-processing code, or
        ``None`` when no building/room pair matches.
    """
    for Build in Place.findall('Building'):
        euid = ' '
        for BuildId in Build.findall('BuildingIdentification'):
            bid = BuildId.find('Identifier').text
            # Fix: compare against the ``pbuildid`` parameter.  The original
            # read ``pbid``, which is not defined in this function and only
            # resolved through a same-named script-level global.
            if bid == pbuildid:
                for Room in Build.findall('Room'):
                    roomid = ' '
                    for RoomId in Room.findall('RoomIdentification'):
                        roomid = RoomId.find('Identifier').text
                        if roomid == proomid:
                            # ...Collect data from Room element...
                            # ... do some simple math with if statements
                            # NOTE(review): ``data`` is produced by the elided
                            # code above; it is not defined in this listing.
                            return data  # list of 15 data values
    return None
# Main driver: walks the Place / Building / Room hierarchy of the first file
# (proot) and, for each room, pulls matching data from the second file and
# writes a CSV row to the output file ``o``.
# NOTE(review): indentation was lost in this listing; the nesting has to be
# restored before the script can run.
for pPlace in proot.findall('.//Place'):
for pPlaceId in pPlace.findall('PlaceIdentification'):
pplaceid = pPlaceId.find('PlaceIdentifier').text
# NOTE(review): ``placeid`` and ``placecnt`` are not assigned at script level
# anywhere in the visible code (``placeid`` is only a local of
# get_place_elem) -- confirm where they are initialised.
if placeid == pplaceid:
placecnt += 1
#... get some data
# NOTE(review): 'Buidling' looks like a typo -- the sample XML uses <Building>,
# so this findall would match nothing.
for pBuild in pPlace.findall('Buidling'):
for pBuildId in pBuild.findall('BuildingIdentification'):
pbid = pBuildId.find('Identifier').text
for pRoom in pBuild.findall('Room'):
for pRoomId in pRoom.findall('RoomIdentification'):
# NOTE(review): in the sample XML <Identifier> is a child of
# <RoomIdentification>, so pRoomId.find(...) is presumably intended;
# pRoom.find('Identifier') would return None there.
proomid = pRoom.find('Identifier').text
# Fetch the Place element from the second file only when the place id changes.
# NOTE(review): ``prevpplaceid`` is not initialised before this first comparison
# in the visible code.
if prevpplaceid != pplaceid:
# Clear the previously fetched Place element before fetching the next one.
if placecnt != 1:Place.clear()
# NOTE(review): ``get_fac_elem`` is not defined in the visible code;
# ``get_place_elem`` takes the same two arguments -- presumably the intended call.
Place = get_fac_elem(pplaceid,largefile2)
prevpplaceid = pplaceid
data = getfacdata(pplaceid,pbid,proomid,Place)
#...Collect data from Room element...
#... do some simple math with if statements
# NOTE(review): a new csv.writer is created for every row; one writer created
# once before the loops would do.
writer = csv.writer(o)
# NOTE(review): placeholder line -- the closing parentheses sit inside the comment,
# so this statement is incomplete as written.
writer.writerow( ( # data from proom and from 'data' list from processing largefile2 in csv format##))
break
prevpplaceid = pplaceid
o.close()
通用化后的 XML 示例:
<Payload>
<Place>
<PlaceName>Place1</PlaceName>
<PlaceStatusCode>OP</PlaceStatusCode>
<PlaceStatusCodeYear>2011</PlaceStatusCodeYear>
<PlaceComment/>
<PlaceIdentification>
<PlaceIdentifier>id001</PlaceIdentifier>
<StateAndCountyFIPSCode>77702</StateAndCountyFIPSCode>
</PlaceIdentification>
<PlaceAddress>
<LocationAddressText>111 Main</LocationAddressText>
<SupplementalLocationText/>
<LocalityName>City1</LocalityName>
<LocationAddressStateCode>State1</LocationAddressStateCode>
<LocationAddressPostalCode>12345</LocationAddressPostalCode>
<LocationAddressCountryCode>USA</LocationAddressCountryCode>
</PlaceAddress>
<PlaceGeographicCoordinates>
<LatitudeMeasure>88.888</LatitudeMeasure>
<LongitudeMeasure>-99.999</LongitudeMeasure>
</PlaceGeographicCoordinates>
<Building>
<BuildingDescription>Building1</BuildingDescription>
<BuildingTypeCode>999</BuildingTypeCode>
<BuildingIdentification>
<Identifier>Building1</Identifier>
</BuildingIdentification>
<Room>
<RoomIdentification>
<Identifier>Room1</Identifier>
</RoomIdentification>
... More data ...
</Room>
<Room>
<RoomIdentification>
<Identifier>Room2</Identifier>
</RoomIdentification>
... More data ...
</Room>
...
</Building>
<Building>
<BuildingDescription>Building2</BuildingDescription>
<BuildingTypeCode>999</BuildingTypeCode>
<BuildingIdentification>
<Identifier>Building2</Identifier>
</BuildingIdentification>
<Room>
<RoomIdentification>
<Identifier>Room1</Identifier>
</RoomIdentification>
... More data ...
</Room>
<Room>
<RoomIdentification>
<Identifier>Room4</Identifier>
</RoomIdentification>
... More data ...
</Room>
...
</Building>
...
</Place>
<Place>
...
</Place>