I'm new to Python and web scraping. I've been trying to extract some data from a website. This is how far I have gotten:
- Get the header of each test on the tests index
- Get the common questions section
- Get the at a glance section
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

url = 'https://labtestsonline.org/tests-index'
page = requests.get(url).content
soup = BeautifulSoup(page, 'lxml')

# the list of hyperlinks found on the index page
hyperlinks = []

# function to get hyperlinks of all test components on the index page
def parseUrl(url):
    global hyperlinks
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')
    for div in soup.findAll('div', {'class': 'field-content'}):
        a = div.find('a')
        href = urljoin(url, a.get('href'))
        hyperlinks.append(href)

parseUrl(url)

# header
h = []
# common questions section
p1 = []
p2 = []
p3 = []
p4 = []
p5 = []
# At a glance section
g1 = []
g2 = []
g3 = []
g4 = []

# function to get the header, common questions section and at a glance section
def bucket(url):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')
    global h
    h = soup.find('div', {'class': 'field-wrapper field field-node--title field-name-title field-type-string field-label-hidden'}).get_text()
    p = []
    for t in soup.findAll('div', {'class': 'accordion-content'}):
        p.append(t.get_text())
    global p1
    global p2
    global p3
    global p4
    global p5
    p1 = p[0]
    p2 = p[1]
    p3 = p[2]
    p4 = p[3]
    p5 = p[4]
    g = []
    for p in soup.find_all('div', {'class': 'medium-6 columns paragraph paragraph--type--text-area paragraph--view-mode--default'}):
        g.append(p.find('p').get_text())
    global g1
    global g2
    global g3
    global g4
    g1 = g[0]
    g2 = g[1]
    g3 = g[2]
    g4 = g[3]
    storedata(h, p1, p2, p3, p4, p5, g1, g2, g3, g4)

# csv file to be written
storefile = 'h1.csv'

# function to write one row to the csv file
def storedata(h, p1, p2, p3, p4, p5, g1, g2, g3, g4):
    global storefile
    handle = open(storefile, 'a+')
    writer = csv.writer(handle)
    writer.writerow((h, p1, p2, p3, p4, p5, g1, g2, g3, g4))
    handle.close()

# pull data from the hyperlinks
for i in range(0, len(hyperlinks)):
    bucket(hyperlinks[i])
This is the error I get:
IndexError Traceback (most recent call last)
<ipython-input-21-689b496a8f00> in <module>()
2
3 for i in range(0,len(h1)):
----> 4 bucket(h1[i])
<ipython-input-19-ca0a2210fce6> in bucket(url)
31 global p5
32 p1 = p[0]
---> 33 p2 = p[1]
34 p3 = p[2]
35 p4 = p[3]
IndexError: list index out of range
There are about 1300 links, and I can only extract information from about 300 of them. The loop stops at that point and throws the error above.
Answer 0 (score: 0):
Your indentation got mangled, so it's hard to tell whether those assignments are supposed to sit inside the for t in soup.findAll('div', ...): loop. If they were, the code would probably have failed much earlier, so presumably they aren't; that means you've hit a page that has only one matching <div>. You should therefore check the length of p before indexing into it, and do the same check for g further down.
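A minimal sketch of that length check, assuming an empty string is an acceptable placeholder for a missing section (the pad helper and the fill value are my own illustration, not part of the original code):

def pad(items, size, fill=''):
    # pad a list with a placeholder so fixed-position unpacking is safe
    # (assumption: '' is fine as a stand-in for a missing section)
    return items + [fill] * (size - len(items))

# e.g. a page whose accordion has only one entry:
p = ['Why get tested?']
p1, p2, p3, p4, p5 = pad(p, 5)   # no IndexError; p2..p5 are ''

# the same guard applies to the at-a-glance list:
g = []
g1, g2, g3, g4 = pad(g, 4)

Inside bucket() you would call pad(p, 5) and pad(g, 4) right after the two findAll loops, before the individual assignments, so every page writes a full CSV row even when some sections are missing.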