我编写了下面的代码来解析这个(this)XML 文件。代码目前还有些凌乱,但其中大部分已经可以正确工作。
我卡住的地方是 'targets'(目标)部分(我把针对这一部分尝试过的代码用三引号保留在了下面,但可以看到它并不起作用)。
有人能告诉我哪里出错了,或者应该如何解析 targets 部分吗?如果对照此处(here)该 XML 文件对应的 HTML 页面,我基本上只想针对每个基因/条目提取 targets 部分中的信息(XML 文件的 targets 部分似乎包含更多信息,如果可能的话,我也想一并提取)。
谢谢
import requests
import xml.etree.ElementTree as ET
import urllib2
# get the XML file
# response = requests.get('https://www.drugbank.ca/drugs/DB01048.xml')
# with open('output.txt', 'w') as input:
#     input.write(response.content)
tree = ET.parse('output.txt')
root = tree.getroot()


def val(x):
    """Return tag name *x* qualified with the DrugBank XML namespace."""
    return "{http://www.drugbank.ca}" + str(x)


# Simple single-valued child elements copied verbatim into key_dict.
key_list = [
    'drugbank-id', 'name', 'description', 'cas-number', 'unii',
    'average-mass', 'monoisotopic-mass', 'state', 'indication',
    'pharmacodynamics', 'mechanism-of-action', 'toxicity', 'metabolism',
    'absorption', 'half-life', 'protein-binding', 'route-of-elimination',
    'volume-of-distribution', 'fda-label', 'msds',
]

# Maps each tag name in key_list to its UTF-8 encoded text, or None when the
# element is missing or empty.  If the root has several children, the value
# from the last child wins (same as the original loop order).
key_dict = {}
for i in key_list:
    # Iterating the element directly replaces the deprecated getchildren().
    for child in root:
        element = child.find(val(i))
        # find() returns None for a missing element and .text is None for an
        # empty one; guard both instead of raising AttributeError.
        text = element.text if element is not None else None
        key_dict[i] = text.encode('utf-8') if text is not None else None
# print key_dict
def method1(str_name, list_name, node=None):
    """Return the text of every direct child of a node with tag *str_name*.

    Args:
        str_name: Fully qualified tag the node must have.
        list_name: Ignored.  Kept only so existing call sites keep working.
        node: Element to inspect.  Defaults to the module-level ``subnode``
            loop variable, which is what the original implementation read.

    Returns:
        A list with the ``.text`` of each direct child when the node's tag
        equals *str_name*, otherwise ``None``.
    """
    if node is None:
        # Fall back to the global loop variable used by the calling script.
        node = subnode
    if node.tag != str_name:
        return None
    return [child.text for child in node]
def method2(list1_name, list2_name, list3_name, list4_name, node=None):
    """Yield the text of elements found along a fixed four-level tag path.

    The node itself must have tag *list1_name*; below it the generator walks
    children tagged *list2_name*, then *list3_name*, and yields the ``.text``
    of every grandchild tagged *list4_name*, in document order.

    Args:
        list1_name: Tag required on the node itself.
        list2_name: Tag of the first-level children to descend into.
        list3_name: Tag of the second-level children to descend into.
        list4_name: Tag of the third-level elements whose text is yielded.
        node: Element to inspect.  Defaults to the module-level ``subnode``
            loop variable, which is what the original implementation read.

    Yields:
        The ``.text`` of each matching element; nothing if the node's tag
        does not equal *list1_name*.
    """
    if node is None:
        # Fall back to the global loop variable used by the calling script.
        node = subnode
    if node.tag != list1_name:
        return
    for level2 in node:
        if level2.tag != list2_name:
            continue
        for level3 in level2:
            if level3.tag != list3_name:
                continue
            for leaf in level3:
                if leaf.tag == list4_name:
                    yield leaf.text
def method3(list1_name, list2_name, node=None):
    """Collect the field texts of each *list2_name* child of a node.

    Args:
        list1_name: Tag required on the node itself.
        list2_name: Tag of the repeated children to collect.
        node: Element to inspect.  Defaults to the module-level ``subnode``
            loop variable, which is what the original implementation read.

    Returns:
        A list with one inner list per matching child, holding the ``.text``
        of that child's own direct children.  An empty list is returned when
        the node's tag is not *list1_name* or no child matches.
    """
    if node is None:
        # Fall back to the global loop variable used by the calling script.
        node = subnode
    if node.tag != list1_name:
        return []
    return [
        [field.text for field in item]
        for item in node
        if item.tag == list2_name
    ]
# Accumulators for the repeatable children of <classification>.
alternative_parents = []
substituents = []
list_to_run_thru = ['description', 'direct-parent', 'kingdom', 'superclass', 'class', 'subclass']


def ap_sub(x):
    """Return tag name *x* qualified with the DrugBank XML namespace."""
    return '{http://www.drugbank.ca}' + x


# (container tag, repeated child tag) pairs handed to method3, in print order.
_METHOD3_SECTIONS = (
    ('salts', 'salt'),
    ('products', 'product'),
    ('mixtures', 'mixture'),
    ('packagers', 'packager'),
    ('categories', 'category'),
    ('dosages', 'dosage'),
    ('atc-codes', 'atc-code'),
    ('ahfs-codes', 'ahfs-code'),
    ('pdb-entries', 'pdb-entry'),
    ('food-interactions', 'food-interaction'),
    ('drug-interactions', 'drug-interaction'),
    ('calculated-properties', 'property'),
    ('external-identifiers', 'external-identifier'),
    ('external-links', 'external-link'),
    ('snp-adverse-drug-reactions', 'reaction'),
)

for node in root:
    # The loop variable must be called ``subnode``: method1, method2 and
    # method3 read it as a module-level global.
    for subnode in node:
        # print(x) with a single argument behaves the same on Python 2 and 3.
        print(method1(ap_sub('groups'), 'group_list'))
        print(method1(ap_sub('synonyms'), 'synonym_list'))
        print(method1(ap_sub('patent'), 'patent_list'))
        # method2 is a generator; materialise it so the ids are printed
        # rather than the generator object itself.
        print(list(method2(ap_sub('general-references'), ap_sub('articles'),
                           ap_sub('article'), ap_sub('pubmed-id'))))
        if subnode.tag == ap_sub('classification'):
            # Print the single-valued fields in list_to_run_thru order.
            for each_item in list_to_run_thru:
                for i in subnode:
                    if i.tag == ap_sub(each_item):
                        print(i.text)
            # Collect the repeatable fields in their own pass so each value
            # is appended once, not once per entry of list_to_run_thru.
            for i in subnode:
                if i.tag == ap_sub('alternative-parent'):
                    alternative_parents.append(i.text)
                elif i.tag == ap_sub('substituent'):
                    substituents.append(i.text)
        for container_tag, item_tag in _METHOD3_SECTIONS:
            print(method3(ap_sub(container_tag), ap_sub(item_tag)))

# Report the accumulated classification lists once the traversal is done.
print(substituents)
print(alternative_parents)
# NOTE(review): the triple-quoted block below is a bare string literal, so none
# of it is executed.  It holds the draft code for the <pathways> and <targets>
# sections; its indentation has been lost and it still uses Python 2 print
# statements, so it must be re-indented and reviewed before being enabled.
'''
if subnode.tag == '{http://www.drugbank.ca}pathways':
for i in subnode:
if i.tag == '{http://www.drugbank.ca}pathway':
for a in i:
print a.text
for u in a:
if u.tag == '{http://www.drugbank.ca}drug':
for x in u:
print x.text
#missing a bit of data here
if subnode.tag == '{http://www.drugbank.ca}targets':
for i in subnode:
if i.tag == '{http://www.drugbank.ca}target':
print i.text
for a in i:
print a.text
if a.tag == '{http://www.drugbank.ca}actions':
for u in a:
print u.text
if a.tag == '{http://www.drugbank.ca}references':
for u in a:
if u.tag == '{http://www.drugbank.ca}articles':
for x in u:
if x.tag == '{http://www.drugbank.ca}article':
for z in x:
print z.text
'''
答案 0 :(得分:1)
我使用BeautifulSoup进行解析,因为它是一个简单的库。
代码:
a 5
b 8
输出:
import pprint
import requests
from bs4 import BeautifulSoup
# Download the drug page and scrape the "targets" cards into a dict of the
# form {target name: {property label: property value}}.
html = requests.get('https://www.drugbank.ca/drugs/DB01048#BE0004136').text
soup = BeautifulSoup(html, 'html.parser')
div_targets = soup.find('div', class_='bond-list-container targets')
# find() returns None when the container is absent; treat that as "no
# targets" instead of failing with AttributeError.
targets = div_targets.find_all('div', class_='bond card') if div_targets is not None else []
t = {}
for target in targets:
    heading = target.find('strong')
    if heading is None:
        # A card without a <strong> heading has no name to key on; skip it.
        continue
    # <dt> elements carry the labels and <dd> elements the matching values.
    k = [term.get_text() for term in target.find_all('dt')]
    v = [definition.get_text() for definition in target.find_all('dd')]
    t[heading.get_text()] = dict(zip(k, v))
pprint.pprint(t)