No luck scraping an ASP site past page 10

Asked: 2017-09-26 16:42:14

Tags: python selenium selenium-webdriver web-scraping

The website I'm scraping is:

http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx

My code reads the pagination numbers and iterates through them, but it fails as soon as it tries to go past page 10, because at that point the pager shows three dots (...); clicking them in a browser loads page 11 (and the same thing happens again after pages 20, 30, and so on). How can I update the code below so that it handles this without breaking?

The code I'm using is:

import re
import string
import urlparse

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup

class DoctorScraper(object):
    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1120, 550)

    def scrape(self):
        self.driver.get(self.url)

        # choose to search using the region 
        try:
            self.driver.find_element_by_id('SearchChkb_5').click()
        except NoSuchElementException:
            pass

        #get the provinces that are available
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        option_indexes = range(1, len(select.options))

        #iterate through the provinces
        for index in option_indexes[:3]:
            select.select_by_index(index)
            #click the search button
            self.driver.find_element_by_id('cmdSearch').click()

            pageno = 2

            while True:
                #create a beautiful soup of the page source code
                s = BeautifulSoup(self.driver.page_source)
                #get all links that match seeing practitioner profile
                r1 = re.compile(r'^PractitionerView\.aspx\?FILENO=([A-Z0-9-]+)$')
                #create a dictionary of the attributes
                x = {'href': r1}

                #so in the page source, find all links that have the attributes stated in x
                for a in s.findAll('a', attrs=x):
                    print 'View Doctor URL: ', urlparse.urljoin(self.driver.current_url, a['href'])
                    print 

                # Pagination
                try:                    
                    next_page_elem = self.driver.find_element_by_xpath("//a[text()='%d']" % pageno)
                    print "Next page: ", next_page_elem
                except NoSuchElementException:
                    break # no more pages

                print 'page ', pageno, '\n'
                next_page_elem.click()

                pageno += 1

        self.driver.quit()

if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.scrape()

I'm getting this error:

StaleElementReferenceException: {"errorMessage":"Element does not exist in cache","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"121","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:63135","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\": \"tag name\", \"sessionId\": \"ef6d0590-a2d6-11e7-91fa-5773b3326267\", \"id\": \":wdc:1506442969197\", \"value\": \"option\"}","url":"/elements","urlParsed":{"anchor":"","query":"","file":"elements","directory":"/","path":"/elements","relative":"/elements","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/elements","queryKey":{},"chunks":["elements"]},"urlOriginal":"/session/ef6d0590-a2d6-11e7-91fa-5773b3326267/element/:wdc:1506442969197/elements"}}
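For context: this exception means a previously located element was used after the page's DOM had been replaced, which is exactly what the ASP.NET postback does on every click. A minimal sketch of the usual fix is to re-locate the element after each page change instead of reusing a cached reference (the helper click_page_link is illustrative, not part of the code above):

from selenium.common.exceptions import (NoSuchElementException,
                                        StaleElementReferenceException)

def click_page_link(driver, pageno, retries=3):
    # Re-locate the pager link on every attempt; cached references
    # go stale as soon as the postback rebuilds the DOM.
    for _ in range(retries):
        try:
            driver.find_element_by_xpath("//a[text()='%d']" % pageno).click()
            return True
        except StaleElementReferenceException:
            continue  # DOM was rebuilt mid-lookup; find the link again
        except NoSuchElementException:
            return False  # no link with this page number on the pager
    return False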

1 Answer:

Answer 0 (score: 0):

The main problem with this site is that clickable elements often scroll out of view and raise an element not clickable error. I've fixed that, though. If ChromeDriver is installed on your machine, just run the script and watch the magic: it will traverse every page flawlessly, no matter how many there are. I've verified it.
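As an aside, an alternative to scrolling the whole window (which the code below does) is to scroll the specific element into view before clicking it; a minimal sketch, with an illustrative locator:

elem = driver.find_element_by_id('cmdSearch')
driver.execute_script("arguments[0].scrollIntoView(true);", elem)
elem.click()

The full working script: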

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

main_link = 'http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx'

def get_content(driver,wait,link):
    driver.get(link)
    driver.find_element_by_id('SearchChkb_5').click()
    select = Select(driver.find_element_by_id('ddlProvince'))
    select.select_by_visible_text('WESTERN CAPE')
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    elem = wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
    elem.click()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    page_counter = 2
    while True:
        try:
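            # pages 11, 21, 31, ... have no numbered link; they are reached via "..."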
            if page_counter % 10 != 1:
                driver.find_element_by_link_text(str(page_counter)).click()
                page_counter += 1
            else:
                driver.find_elements_by_link_text("...")[-1].click()
                time.sleep(2)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                page_counter += 1
        except NoSuchElementException:
            break

if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    try:
        get_content(driver,wait,main_link)
    finally:    
        driver.close()
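A note on the pagination logic above: pages whose number ends in 1 (11, 21, 31, ...) have no numbered link until the trailing "..." link is clicked, which is what the page_counter % 10 check handles. Clicking "..." triggers an ASP.NET postback, so the time.sleep(2) and the scroll give the re-rendered pager time to appear; replacing the fixed sleep with an explicit WebDriverWait on the expected link text would be a sturdier variant.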

Using a class:

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class DoctorScraper(object):
    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def __del__(self):
        self.driver.close()

    def controlling_pagination(self):
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page_counter = 2
        while True:
            try:
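                # pages 11, 21, 31, ... are only reachable through the "..." link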
                if page_counter % 10 != 1:
                    self.driver.find_element_by_link_text(str(page_counter)).click()
                    page_counter += 1
                else:
                    self.driver.find_elements_by_link_text("...")[-1].click()
                    time.sleep(2)
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    page_counter += 1
            except NoSuchElementException:
                break

    def get_content(self):
        self.driver.get(self.url)
        self.driver.find_element_by_id('SearchChkb_5').click()
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        select.select_by_visible_text('WESTERN CAPE')
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        elem = self.wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
        elem.click()
        self.controlling_pagination()

if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.get_content()
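One design note on the class variant: it relies on __del__ to close the driver, and Python makes no promise about when (or whether) __del__ runs; the try/finally used in the function version is the more predictable way to shut the browser down.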

By the way, if you look at the bottom of the screenshot, you can see the page number changing:

[screenshot: the results page, with the pagination bar changing at the bottom]