The website I'm scraping is:
http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx
My code reads the pagination numbers and iterates through them, but it fails as soon as it has to go past page 10, because the pager then only shows three dots (...). Clicking the dots in a browser loads page 11 (and the same thing happens again after page 20, page 30, and so on). How can I update the code below so that it handles this without breaking?
The code I'm using is:
import re
import string
import urlparse

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup


class DoctorScraper(object):
    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1120, 550)

    def scrape(self):
        self.driver.get(self.url)

        # choose to search using the region
        try:
            self.driver.find_element_by_id('SearchChkb_5').click()
        except NoSuchElementException:
            pass

        # get the provinces that are available
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        option_indexes = range(1, len(select.options))

        # iterate through the provinces
        for index in option_indexes[:3]:
            select.select_by_index(index)
            # click the search button
            self.driver.find_element_by_id('cmdSearch').click()
            pageno = 2
            while True:
                # create a beautiful soup of the page source code
                s = BeautifulSoup(self.driver.page_source)
                # get all links that match seeing practitioner profile
                r1 = re.compile(r'^PractitionerView\.aspx\?FILENO=([A-Z0-9-]+)$')
                # create a dictionary of the attributes
                x = {'href': r1}
                # so in the page source, find all links that have the attributes stated in x
                for a in s.findAll('a', attrs=x):
                    print 'View Doctor URL: ', urlparse.urljoin(self.driver.current_url, a['href'])
                print

                # Pagination
                try:
                    next_page_elem = self.driver.find_element_by_xpath("//a[text()='%d']" % pageno)
                    print "Next page: ", next_page_elem
                except NoSuchElementException:
                    break  # no more pages

                print 'page ', pageno, '\n'
                next_page_elem.click()
                pageno += 1

        self.driver.quit()


if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.scrape()
I get this error:
StaleElementReferenceException: {"errorMessage":"Element does not exist in cache","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"121","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:63135","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\": \"tag name\", \"sessionId\": \"ef6d0590-a2d6-11e7-91fa-5773b3326267\", \"id\": \":wdc:1506442969197\", \"value\": \"option\"}","url":"/elements","urlParsed":{"anchor":"","query":"","file":"elements","directory":"/","path":"/elements","relative":"/elements","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/elements","queryKey":{},"chunks":["elements"]},"urlOriginal":"/session/ef6d0590-a2d6-11e7-91fa-5773b3326267/element/:wdc:1506442969197/elements"}}
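(As an aside: a StaleElementReferenceException usually means a WebElement reference is being reused after the page it came from was reloaded. In the code above the Select object is built once, but clicking cmdSearch triggers a postback, so on the next pass through the loop the cached <select> element is no longer attached to the DOM. Below is a minimal sketch of one way around that, re-locating the dropdown on every iteration; the element ids come from the code above, while the rest, including reloading the search form between provinces, is an assumption about the page flow.)

from selenium import webdriver
from selenium.webdriver.support.ui import Select

SEARCH_URL = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"

driver = webdriver.PhantomJS()
driver.get(SEARCH_URL)
driver.find_element_by_id('SearchChkb_5').click()

# count the provinces once, but do not keep the element itself around
province_count = len(Select(driver.find_element_by_id('ddlProvince')).options)

for index in range(1, province_count):
    # re-locate the dropdown on every pass; the reference from the previous
    # pass is stale after the search postback reloaded the page
    select = Select(driver.find_element_by_id('ddlProvince'))
    select.select_by_index(index)
    driver.find_element_by_id('cmdSearch').click()
    # ... scrape the result pages for this province here ...
    # reload the search form so the next pass starts from a fresh page
    driver.get(SEARCH_URL)
    driver.find_element_by_id('SearchChkb_5').click()

driver.quit()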
Answer 0 (score: 0)
The main problem with this site is that clickable elements frequently end up out of the viewport, which raises an element not clickable error. However, I've worked around that. If ChromeDriver is installed on your machine, just run the script below and watch it work: it steps through every page flawlessly, no matter how many there are. I've verified it.
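(The scripts below handle this by scrolling the whole window with window.scrollTo before clicking. If you would rather scroll just one element into view, a common alternative, sketched here and not part of the original answer, is to pass the element itself to execute_script:)

def click_after_scrolling(driver, element_id):
    # scroll the target element itself into view, then click it;
    # this avoids "element not clickable" when it sits below the fold
    elem = driver.find_element_by_id(element_id)
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    elem.click()

# usage (with a webdriver.Chrome() instance already on the search page):
# click_after_scrolling(driver, 'cmdSearch')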
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

main_link = 'http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx'


def get_content(driver, wait, link):
    driver.get(link)
    # search by region and pick a province
    driver.find_element_by_id('SearchChkb_5').click()
    select = Select(driver.find_element_by_id('ddlProvince'))
    select.select_by_visible_text('WESTERN CAPE')
    # scroll down so the search button is in view, then click it
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    elem = wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
    elem.click()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    page_counter = 2
    while True:
        try:
            if not page_counter % 10 == 1:
                # the next page number is visible in the pager, click it directly
                driver.find_element_by_link_text(str(page_counter)).click()
                page_counter += 1
            else:
                # pages 11, 21, 31, ... are only reachable through the "..." link
                driver.find_elements_by_link_text("...")[-1].click()
                time.sleep(2)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                page_counter += 1
        except NoSuchElementException:
            break


if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    try:
        get_content(driver, wait, main_link)
    finally:
        driver.close()
Using a class:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class DoctorScraper(object):
    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def __del__(self):
        self.driver.close()

    def controlling_pagination(self):
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page_counter = 2
        while True:
            try:
                if not page_counter % 10 == 1:
                    # the next page number is visible in the pager, click it directly
                    self.driver.find_element_by_link_text(str(page_counter)).click()
                    page_counter += 1
                else:
                    # pages 11, 21, 31, ... are only reachable through the "..." link
                    self.driver.find_elements_by_link_text("...")[-1].click()
                    time.sleep(2)
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    page_counter += 1
            except NoSuchElementException:
                break

    def get_content(self):
        self.driver.get(self.url)
        # search by region and pick a province
        self.driver.find_element_by_id('SearchChkb_5').click()
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        select.select_by_visible_text('WESTERN CAPE')
        # scroll down so the search button is in view, then click it
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        elem = self.wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
        elem.click()
        self.controlling_pagination()


if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.get_content()
By the way, if you look at the bottom of the screenshot, you can see the page number changing:
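(The answer above only drives the pagination; it does not collect the PractitionerView links the question was printing. If you want both, a rough sketch that combines the answer's "..." handling with the link extraction from the question could look like the following. It assumes the same Python 2 setup as the question; the regex and element ids are taken from the question, the pagination logic from the answer, and everything else, such as the fixed two-second pause, is an assumption.)

import re
import time
import urlparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException

SEARCH_URL = 'http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx'
PROFILE_RE = re.compile(r'^PractitionerView\.aspx\?FILENO=([A-Z0-9-]+)$')


def collect_links(driver):
    # pull every PractitionerView link out of the current result page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for a in soup.find_all('a', href=PROFILE_RE):
        print(urlparse.urljoin(driver.current_url, a['href']))


def scrape_province(driver, province):
    driver.get(SEARCH_URL)
    driver.find_element_by_id('SearchChkb_5').click()
    Select(driver.find_element_by_id('ddlProvince')).select_by_visible_text(province)
    driver.find_element_by_id('cmdSearch').click()

    collect_links(driver)                      # page 1
    page_counter = 2
    while True:
        try:
            if page_counter % 10 != 1:
                # the next page number is visible in the pager
                driver.find_element_by_link_text(str(page_counter)).click()
            else:
                # pages 11, 21, 31, ... sit behind the "..." link
                driver.find_elements_by_link_text("...")[-1].click()
        except (NoSuchElementException, IndexError):
            break                              # no more pages
        time.sleep(2)                          # crude wait for the postback
        collect_links(driver)
        page_counter += 1


if __name__ == '__main__':
    driver = webdriver.Chrome()
    try:
        scrape_province(driver, 'WESTERN CAPE')
    finally:
        driver.quit()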