I'm parsing a huge XML file (700+ MB, about 7.5 million lines). When my code finishes running, I only get parsed data for roughly the first 160,000 rows. The program exits with code 0. Am I running out of memory? Or is it a limitation of the libraries (lxml or BeautifulSoup)? How can I fix this?
Here's my code:
from bs4 import BeautifulSoup
import html5lib
import lxml
import csv
import gc
def parse(soup, filename):
    # Write one CSV row per top-level <drug> element.
    csvfile = open(filename, 'w')
    writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
    writer.writerow(['name', 'primary_id', 'drug_interaction_list', 'food_interaction_list', 'dosage_list'])
    i = 0
    soup.prettify()  # returns a formatted string; the result is discarded here
    # Only direct <drug> children of <drugbank> (recursive=False skips nested ones).
    drugs = soup.drugbank.findAll(name="drug", recursive=False)
    print(len(soup.findAll(name="drugbank")))
    print(len(drugs))
    for drug in drugs:
        # gc.collect()
        name = drug.find(name='name', recursive=False).string
        primary_id = drug.find(name='drugbank-id', attrs={'primary': 'true'}).string
        # Collect drug interactions that carry a description.
        drug_interactions = drug.find(name='drug-interactions', recursive=False)
        drug_interactions = drug_interactions.findAll()
        drug_interacts = []
        for drug_interaction in drug_interactions:
            if drug_interaction.description is not None:
                drug_interacts.append({"drugbank_id": drug_interaction.find(name='drugbank-id', recursive=False).string,
                                       "name": drug_interaction.find(name='name', recursive=False).string,
                                       "description": drug_interaction.find(name='description', recursive=False).string})
        # Collect food interaction strings.
        food_interactions = drug.find(name='food-interactions', recursive=False)
        food_interacts = []
        for food_interaction in food_interactions.findAll(name='food-interaction', recursive=False):
            food_interacts.append(food_interaction.string)
        # Collect dosage form/route/strength entries.
        dosages = drug.dosages
        dosage_arr = []
        for dosage in dosages.findAll(name='dosage'):
            dosage_arr.append({"form": "\n\n" + dosage.form.string, "route": dosage.route.string,
                               "Strength": dosage.strength.string})
        writer.writerow([name, primary_id, drug_interacts, food_interacts, dosage_arr])
        i = i + 1
        print("Processed " + str(i) + " rows for csv")

# Parse the entire file into one in-memory tree, then extract rows from it.
soup = BeautifulSoup(open("/Users/kamilsaitov/Desktop/full database.xml"), "lxml")
parse(soup, "DrugDatabase.csv")
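
Would a streaming parser help? Here is a minimal sketch of what I have in mind, using lxml.etree.iterparse so the whole tree never lives in memory at once. The tag names and the file paths come from my code above; the parse_streaming name, the reduced set of columns, the parent check, and the {*} namespace wildcard are illustrative assumptions, not tested against the real file:

import csv
from lxml import etree

def parse_streaming(xml_path, csv_path):
    # Stream <drug> elements one at a time instead of parsing the whole file.
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        writer.writerow(['name', 'primary_id'])
        # '{*}drug' matches a <drug> tag in any (or no) XML namespace.
        for _, drug in etree.iterparse(xml_path, events=('end',), tag='{*}drug'):
            # Only handle direct children of the <drugbank> root, in case
            # <drug> also appears nested deeper in the document.
            parent = drug.getparent()
            if parent is None or not parent.tag.endswith('drugbank'):
                continue
            name = drug.findtext('{*}name')
            primary_id = None
            for db_id in drug.findall('{*}drugbank-id'):
                if db_id.get('primary') == 'true':
                    primary_id = db_id.text
                    break
            writer.writerow([name, primary_id])
            # Release the finished element and its earlier siblings so
            # memory use stays flat across the whole file.
            drug.clear()
            while drug.getprevious() is not None:
                del parent[0]

parse_streaming("/Users/kamilsaitov/Desktop/full database.xml", "DrugDatabase.csv")

With iterparse the peak memory should stay near one <drug> subtree rather than the full 700 MB document. Is this the right direction, or is there a way to get BeautifulSoup to handle the whole file?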