以下代码从网页中抓取字段时工作正常,但我还想再抓取网页上的一项信息:实际研究完成日期(Actual Study Completion Date)。
我把它添加到了名为 subset 的列表末尾,以为代码会像处理其他字段一样找到并抓取它,但这个字段并没有被抓取到。
我怎样才能获取这个字段?
(为了便于参考,URL 为 https://clinicaltrials.gov/ct2/show/study/NCT02170532 )
import bs4
from collections import defaultdict
from bs4 import BeautifulSoup
import requests
def clinicalTrialsGov(nctid):
    """Print selected fields from a trial's ClinicalTrials.gov XML record.

    The record is requested with ``?displayxml=true`` and parsed as XML.
    Every element whose tag name appears in the list below is collected,
    grouped under a ``ct<Tagname>`` label, and printed one label per line
    with the values joined by ``', '``.

    Args:
        nctid: Trial identifier appended to the URL, e.g. ``'NCT02170532'``.

    Returns:
        None. The collected fields are only printed.
    """
    wanted_tags = [
        'intervention_type', 'study_type', 'allocation', 'intervention_model',
        'primary_purpose', 'masking', 'enrollment', 'official_title',
        'condition', 'minimum_age', 'maximum_age', 'gender',
        'healthy_volunteers', 'phase', 'primary_outcome', 'secondary_outcome',
        'number_of_arms',
        # NOTE(review): nothing is printed for this name; presumably the XML
        # record has no element with this tag. Confirm against the XML.
        'actual_study_completion_date',
    ]

    record_url = "https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true"
    response = requests.get(record_url)
    document = BeautifulSoup(response.text, "xml")

    # Group the stripped text of each matching element by its label.
    collected = defaultdict(list)
    for element in document.find_all(wanted_tags):
        label = 'ct{}'.format(element.name.capitalize())
        collected[label].append(element.get_text(strip=True))

    for label, values in collected.items():
        print('{}: {}'.format(label, ', '.join(values)))
# Example run: print the scraped fields for trial NCT02170532.
clinicalTrialsGov('NCT02170532')
答案 0 :(得分:1)
不确定你想把它添加到哪里。这个字段似乎必须从另一个网址获取。
您可以用 :has 选择这样的 td:它包含一个 data-term 属性值为 "Study Completion Date" 的子元素;然后使用相邻兄弟组合器(+)移动到与之关联的日期 td。
from collections import defaultdict
from bs4 import BeautifulSoup as bs
import requests
def clinicalTrialsGov(nctid):
    """Collect fields for one trial from ClinicalTrials.gov.

    Two pages are fetched over a single ``requests.Session``: the XML record
    (``?displayxml=true``) for the tag-based fields, and the study page
    (``/ct2/show/study/<nctid>``) for three dates that are located through
    their ``data-term`` attribute.

    Args:
        nctid: Trial identifier, e.g. ``'NCT02170532'``.

    Returns:
        A ``defaultdict(list)``. Keys of the form ``ct<Tagname>`` map to a
        list of stripped text values, one per matching XML element. The keys
        ``'actual_study_completion_date'``, ``'Study Start Date'`` and
        ``'Actual Primary Completion Date'`` each map to the text of the
        matching table cell, or to ``None`` when the page has no such cell.
    """
    subset = [
        'intervention_type', 'study_type', 'allocation', 'intervention_model',
        'primary_purpose', 'masking', 'enrollment', 'official_title',
        'condition', 'minimum_age', 'maximum_age', 'gender',
        'healthy_volunteers', 'phase', 'primary_outcome', 'secondary_outcome',
        'number_of_arms', 'primary_completion_date',
    ]
    # Result key -> value of the data-term attribute that labels the date.
    date_terms = {
        'actual_study_completion_date': 'Study Completion Date',
        'Study Start Date': 'Study Start Date',
        'Actual Primary Completion Date': 'Primary Completion Date',
    }

    data = defaultdict(list)
    with requests.Session() as s:
        xml_url = "https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true"
        soup = bs(s.get(xml_url).text, "xml")
        for tag in soup.find_all(subset):
            data['ct{}'.format(tag.name.capitalize())].append(tag.get_text(strip=True))

        # Printed before the dates are added: the date values are not lists,
        # so ', '.join would split a date string into single characters.
        for key in data:
            print('{}: {}'.format(key, ', '.join(data[key])))

        soup = bs(s.get(f'https://clinicaltrials.gov/ct2/show/study/{nctid}').text, 'lxml')
        for key, term in date_terms.items():
            # Find the cell holding the label, then step to the next cell.
            cell = soup.select_one(f'td:has([data-term="{term}"]) + td')
            # select_one returns None when nothing matches; store None rather
            # than failing with AttributeError on `.text`.
            data[key] = cell.text if cell is not None else None
    return data
# Example run for trial NCT02170532; the returned mapping is discarded here.
clinicalTrialsGov('NCT02170532')