难以使用ElementTree解析XML文件的一部分

时间:2018-04-18 16:20:26

标签: python xml web-scraping elementtree

我编写了下面的代码来解析这个 XML 文件。代码还有些凌乱,但其中大部分已经能够正确工作。

我卡住的地方是 targets(目标)部分(我把针对这一部分所做的尝试用三引号括起来,保留在了下面的代码中,但这部分代码并不起作用)。

我想知道是否有人能告诉我哪里出了错,或者应该如何解析 targets 部分?如果你查看这个 XML 文件对应的 HTML 页面(here),我基本上只想针对每个基因/条目提取 targets 部分中的信息(XML 文件的 targets 部分似乎包含更多信息,如果可能的话,我也想把这些信息一并提取出来)。

谢谢

import requests
import xml.etree.ElementTree as ET
import urllib2

#get the XML file
#response = requests.get('https://www.drugbank.ca/drugs/DB01048.xml')
#with open('output.txt', 'w') as input:
#          input.write(response.content)


# Load the DrugBank XML document that was previously saved to disk.
tree = ET.parse('output.txt')
root = tree.getroot()

# ElementTree reports namespaced tags in Clark notation
# ("{namespace-uri}local-name"), so every lookup needs this prefix.
val = lambda x: "{http://www.drugbank.ca}" + str(x)
key_list = [
     'drugbank-id',
     'name',
     'description',
     'cas-number',
     'unii',
     'average-mass',
     'monoisotopic-mass',
     'state',
     'indication',
     'pharmacodynamics',
     'mechanism-of-action',
     'toxicity',
     'metabolism',
     'absorption',
     'half-life',
     'protein-binding',
     'route-of-elimination',
     'volume-of-distribution',
     'fda-label',
     'msds',
]

# Map each field name in key_list to the UTF-8 encoded text of the matching
# direct child element. When the root has several children, a later child
# overwrites the value stored by an earlier one.
key_dict = {}
for child in root:  # iterate the element itself; getchildren() is deprecated
     for key in key_list:
          element = child.find(val(key))
          # find() returns None for a tag that is absent, and .text is None
          # for an empty element; skip both instead of raising AttributeError.
          if element is None or element.text is None:
               continue
          key_dict[key] = element.text.encode('utf-8')

def method1(str_name, list_name):
    """Collect the text of each direct child of the global ``subnode``.

    The module-level name ``subnode`` is read at call time. When its tag is
    not equal to ``str_name`` the function returns ``None``; otherwise it
    returns a new list holding ``.text`` of every child, in document order.

    ``list_name`` is accepted for call compatibility only: whatever value is
    passed in is never read.
    """
    if subnode.tag != str_name:
        return None
    return [child.text for child in subnode]


def method2(list1_name, list2_name, list3_name, list4_name):
    """Yield text found three levels below the global ``subnode``.

    This is a generator, so the module-level ``subnode`` is read when
    iteration starts rather than when the function is called. Nothing is
    yielded unless ``subnode.tag`` equals ``list1_name``. Otherwise the walk
    descends through children tagged ``list2_name``, then ``list3_name``, and
    yields ``.text`` of each grandchild tagged ``list4_name``, in document
    order.
    """
    if subnode.tag != list1_name:
        return
    for level2 in subnode:
        if level2.tag != list2_name:
            continue
        for level3 in level2:
            if level3.tag != list3_name:
                continue
            for level4 in level3:
                if level4.tag == list4_name:
                    yield level4.text

def method3(list1_name, list2_name):
    """Return one list of field texts per matching entry of ``subnode``.

    The module-level name ``subnode`` is read at call time. When its tag is
    not equal to ``list1_name`` an empty list is returned. Otherwise each
    direct child tagged ``list2_name`` contributes one inner list holding
    ``.text`` of that child's own children, in document order.

    Each entry is now appended exactly once. Previously the append sat inside
    the per-field loop, so an entry with N fields appeared N times in the
    result. Entries without any child element still contribute nothing.
    """
    list_of_tuples = []
    if subnode.tag != list1_name:
        return list_of_tuples
    for entry in subnode:
        if entry.tag != list2_name:
            continue
        fields = [field.text for field in entry]
        if fields:
            list_of_tuples.append(fields)
    return list_of_tuples

alternative_parents = []
substituents = []
list_to_run_thru = ['description','direct-parent','kingdom','superclass','class','subclass']
ap_sub = lambda x:'{http://www.drugbank.ca}'+ x

# (container tag, entry tag) pairs handed to method3, in the order printed.
method3_sections = [
    ('salts', 'salt'),
    ('products', 'product'),
    ('mixtures', 'mixture'),
    ('packagers', 'packager'),
    ('categories', 'category'),
    ('dosages', 'dosage'),
    ('atc-codes', 'atc-code'),
    ('ahfs-codes', 'ahfs-code'),
    ('pdb-entries', 'pdb-entry'),
    ('food-interactions', 'food-interaction'),
    ('drug-interactions', 'drug-interaction'),
    ('calculated-properties', 'property'),
    ('external-identifiers', 'external-identifier'),
    ('external-links', 'external-link'),
    ('snp-adverse-drug-reactions', 'reaction'),
]

# Walk every grandchild of the root. method1/method2/method3 read the current
# element through the module-level name `subnode`, so the loop variable must
# keep exactly that name.
# Single-argument print(...) behaves the same under Python 2 and Python 3.
for node in root:
    for subnode in node:
        print(method1(ap_sub('groups'), 'group_list'))
        print(method1(ap_sub('synonyms'), 'synonym_list'))
        # NOTE(review): the sibling calls use plural container tags
        # ('groups', 'synonyms'); confirm that singular 'patent' is intended.
        print(method1(ap_sub('patent'), 'patent_list'))
        # method2 is a generator: materialise it so the values are printed
        # rather than the generator object's repr.
        print(list(method2(ap_sub('general-references'), ap_sub('articles'),
                           ap_sub('article'), ap_sub('pubmed-id'))))
        if subnode.tag == ap_sub('classification'):
            # Print the scalar classification fields in list_to_run_thru order.
            for each_item in list_to_run_thru:
                for i in subnode:
                    if i.tag == ap_sub(each_item):
                        print(i.text)
            # Collect the repeated fields in a single pass of their own.
            # Doing this inside the loop above appended every value once per
            # entry of list_to_run_thru (six copies of each).
            for i in subnode:
                if i.tag == ap_sub('alternative-parent'):
                    alternative_parents.append(i.text)
                elif i.tag == ap_sub('substituent'):
                    substituents.append(i.text)
        for container_tag, entry_tag in method3_sections:
            print(method3(ap_sub(container_tag), ap_sub(entry_tag)))

print(substituents)
print(alternative_parents)
# Disabled draft: everything between the triple quotes below is a bare string
# literal, so none of it executes. It holds the author's attempt at walking the
# <pathways> and <targets> sections; the question text says this attempt does
# not work.
# NOTE(review): the 8-space indentation suggests it was meant to sit inside a
# `for subnode in node:` loop -- confirm before re-enabling.
'''


        if subnode.tag == '{http://www.drugbank.ca}pathways':
            for i in subnode:
                if i.tag == '{http://www.drugbank.ca}pathway':
                    for a in i:
                        print a.text
                        for u in a:
                            if u.tag == '{http://www.drugbank.ca}drug':
                                for x in u:
                                        print x.text

#missing a bit of data here
        if subnode.tag == '{http://www.drugbank.ca}targets':

            for i in subnode:
                if i.tag == '{http://www.drugbank.ca}target':
                    print i.text
                    for a in i:
                        print a.text
                        if a.tag == '{http://www.drugbank.ca}actions':
                            for u in a:
                                print u.text
                        if a.tag == '{http://www.drugbank.ca}references':
                            for u in a:
                                if u.tag == '{http://www.drugbank.ca}articles':
                                    for x in u:
                                        if x.tag == '{http://www.drugbank.ca}article':
                                            for z in x:
                                                print z.text
'''

1 个答案:

答案 0 :(得分:1)

我使用BeautifulSoup进行解析,因为它是一个简单的库。

代码:

import pprint
import requests
from bs4 import BeautifulSoup

# Fetch the rendered DrugBank page; the target cards are scraped from its HTML.
# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get('https://www.drugbank.ca/drugs/DB01048#BE0004136', timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')
div_targets = soup.find('div', class_='bond-list-container targets')
# find() returns None when the container is absent; fall back to no targets
# rather than failing with AttributeError.
targets = div_targets.find_all('div', class_='bond card') if div_targets is not None else []

# Maps the text of each card's first <strong> element to a dict built by
# pairing that card's <dt> texts with its <dd> texts, in document order.
t = {}
for target in targets:
    heading = target.find('strong')
    if heading is None:
        # No heading means there is no key to store this card under.
        continue
    k = [term.get_text() for term in target.find_all('dt')]
    v = [detail.get_text() for detail in target.find_all('dd')]
    t[heading.get_text()] = dict(zip(k, v))
pprint.pprint(t)