Web scraping: IndexError: list index out of range

Date: 2018-01-29 04:38:39

Tags: python web-scraping beautifulsoup

I am new to Python and web scraping, and I have been trying to extract some data from a website. This is how far I have got:

  1. Get the title of each test on the index page
  2. Get the common questions section
  3. Get the at-a-glance section
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

url = 'https://labtestsonline.org/tests-index'
page = requests.get(url).content
soup = BeautifulSoup(page, 'lxml')

# list of hyperlinks collected from the index page
hyperlinks = []

# function to get the hyperlinks of all test components on the index page
def parseUrl(url):
    global hyperlinks
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')
    for a in soup.findAll('div', {'class': 'field-content'}):
        a = a.find('a')
        href = urljoin(url, a.get('href'))
        hyperlinks.append(href)

parseUrl(url)

# header
h = []
# common questions section
p1 = []
p2 = []
p3 = []
p4 = []
p5 = []

# at a glance section
g1 = []
g2 = []
g3 = []
g4 = []

# function to get the header, common questions section and at a glance section
def bucket(url):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')

    global h
    h = soup.find('div', {'class': 'field-wrapper field field-node--title field-name-title field-type-string field-label-hidden'}).get_text()

    p = []
    for t in soup.findAll('div', {'class': 'accordion-content'}):
        p.append(t.get_text())
    global p1, p2, p3, p4, p5
    p1 = p[0]
    p2 = p[1]
    p3 = p[2]
    p4 = p[3]
    p5 = p[4]

    g = []
    for q in soup.find_all('div', {'class': 'medium-6 columns paragraph paragraph--type--text-area paragraph--view-mode--default'}):
        g.append(q.find('p').get_text())
    global g1, g2, g3, g4
    g1 = g[0]
    g2 = g[1]
    g3 = g[2]
    g4 = g[3]

    storedata(h, p1, p2, p3, p4, p5, g1, g2, g3, g4)

# csv file to be written
storefile = 'h1.csv'

# function to write one row to the csv file
def storedata(h, p1, p2, p3, p4, p5, g1, g2, g3, g4):
    global storefile
    handle = open(storefile, 'a+')
    writer = csv.writer(handle)
    writer.writerow((h, p1, p2, p3, p4, p5, g1, g2, g3, g4))
    handle.close()

# pull data from the hyperlinks
for i in range(0, len(hyperlinks)):
    bucket(hyperlinks[i])

This is the error I get:

IndexError                                Traceback (most recent call last)
<ipython-input-21-689b496a8f00> in <module>()
      2 
      3 for i in range(0,len(h1)):
----> 4     bucket(h1[i])

<ipython-input-19-ca0a2210fce6> in bucket(url)
     31         global p5
     32         p1 = p[0]
---> 33         p2 = p[1]
     34         p3 = p[2]
     35         p4 = p[3]

IndexError: list index out of range

There are about 1300 links, but I can only extract information from around 300 of them. At that point the loop stops and throws the error above.

1 Answer:

Answer 0 (score: 0)

Your indentation is hard to read, so it is difficult to tell whether those assignment lines are meant to be inside the for t in soup.findAll('div', ...): loop. If they were, the code would probably have failed even earlier; so presumably they are not, and you have simply hit a page that has only one such <div>. You should therefore check the length of p before indexing into it, and do the same later with g.
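The suggested length check can be sketched as follows: padding the scraped list to a fixed size turns missing page sections into empty strings instead of an IndexError. The helper name pad is illustrative, not from the question.

```python
# Pad a scraped list to a fixed size, so pages with fewer
# sections yield empty strings instead of raising IndexError.
def pad(items, size):
    return items + [''] * (size - len(items))

# e.g. a page whose accordion section has only one entry
p = ['What is being tested?']
p1, p2, p3, p4, p5 = pad(p, 5)
# p2..p5 are now '' rather than triggering an IndexError
```

The same padding works for the at-a-glance list g before unpacking it into g1..g4.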