Handling pagination on a website that uses input buttons

Asked: 2018-09-17 09:04:17

Tags: python selenium selenium-webdriver web-scraping selenium-chromedriver

I'm trying to scrape this website using Selenium.

The code runs, but at the moment it only scrapes the first page. The page uses input buttons as its way of moving between pages, so I tried clicking each button one by one, but that doesn't work. Does anyone have another way to handle this type of pagination?

import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options

options = Options()
# options.add_argument('--headless')
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
driver=webdriver.Chrome(chrome_options=options, 
executable_path=r'/Users/liban/Downloads/chromedriver')

url = 'http://www.boston.gov.uk/index.aspx?articleid=6207&ShowAdvancedSearch=true'
driver.get(url)


def get_Data():
    data = []
    divs = driver.find_element_by_xpath('//*[@id="content"]/form').find_elements_by_tag_name('div')
    for div in divs:
        app_number = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/h4/a').text
        address = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/p[5]').text
        status = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/p[1]/strong').text
        link = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/h4/a').get_attribute("href")
        proposals = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/p[3]').text

        data.append({"caseRef": app_number, "propDesc": proposals, "address": address,  "caseUrl": link, "status": status})
    print(data)
    return data

def navigation():
    data = []
    total_inputs = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/table/tbody/tr/td/input')
    for input in total_inputs:
        input.click()
        data.extend(get_Data())

def main():
    all_data = []
    select = Select(driver.find_element_by_xpath('//*[@id="DatePresets"]'))
    select.select_by_index(7)
    search_by = driver.find_element_by_xpath('//*[@id="radio-ReceivedDate"]')
    search_by.click()
    show = Select(driver.find_element_by_xpath('//*[@id="ResultSize"]'))
    show.select_by_index(4)
    search_button = driver.find_element_by_xpath('//*[@id="content"]/form/input[3]')
    search_button.click()

    all_data.extend(navigation())

if __name__ == "__main__":
    main()

How the website handles pagination:

  <td align="center">
           <input type="submit" class="pageNumberButton selected" name="searchResults_Page" value="1" disabled="disabled"/>
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="2" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="3" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="4" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="5" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="6" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="7" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="8" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="9" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="10" />
    </td>
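Every button shares the class `pageNumberButton` and carries its page number in the `value` attribute, so the set of available pages can be read straight out of this markup. A small standard-library sketch (not part of the original question code) that extracts those values:

```python
from html.parser import HTMLParser

class PageButtonParser(HTMLParser):
    """Collect the value attribute of every input with class pageNumberButton."""
    def __init__(self):
        super().__init__()
        self.pages = []

    def handle_starttag(self, tag, attrs):
        a = dict(attrs)
        if tag == "input" and "pageNumberButton" in (a.get("class") or ""):
            self.pages.append(a.get("value"))

# Trimmed copy of the pagination markup shown above
snippet = '''
<td align="center">
    <input type="submit" class="pageNumberButton selected" name="searchResults_Page" value="1" disabled="disabled"/>
    <input type="submit" class="pageNumberButton " name="searchResults_Page" value="2" />
    <input type="submit" class="pageNumberButton " name="searchResults_Page" value="3" />
</td>
'''

parser = PageButtonParser()
parser.feed(snippet)
print(parser.pages)  # → ['1', '2', '3']
```

Counting these buttons (or their values) tells you in advance how many pages a result set spans.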

Manual steps:

  1. Select Date Presets = 'Last month'
  2. Search by = 'Both dates'
  3. Click Search
  4. After scraping each page, go to the next page, and so on until there are no more pages, then return to the original URL.

2 answers:

Answer 0: (score: 2)

Try find_elements_by_xpath rather than find_element_by_xpath; it returns a list of matching elements.

PS: I haven't run your code locally, but the problem you describe points to exactly that fix.
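Applied to the question's navigation() function, the idea looks roughly like this. A minimal sketch, not the asker's actual code: the CSS selector, the hypothetical scrape_page callback, and the re-locating loop are my assumptions. Re-locating the buttons after every click matters because the form is re-rendered on each page submit, leaving previously found elements stale.

```python
def scrape_all_pages(driver, scrape_page):
    """Click through every pageNumberButton in order, re-locating the buttons
    after each click so we never hold a stale reference to the re-rendered form.
    `scrape_page` is a hypothetical callback that scrapes the current page."""
    results = [scrape_page(driver)]  # page 1 is already displayed
    next_page = 2
    while True:
        # find_elements (plural) returns a list -- an empty one when nothing matches
        buttons = driver.find_elements_by_css_selector("input.pageNumberButton")
        target = [b for b in buttons if b.get_attribute("value") == str(next_page)]
        if not target:  # no button for the next page: we are done
            break
        target[0].click()
        results.append(scrape_page(driver))
        next_page += 1
    return results
```

The asker's main() could then end with `all_data.extend(scrape_all_pages(driver, get_Data))` instead of calling navigation().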

Answer 1: (score: 1)

As per your question about handling pagination on the website http://www.boston.gov.uk/index.aspx?articleid=6207, you can use the following solution:

  • Code block:

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.select import Select
    
    options = Options()
    options.add_argument("start-maximized")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
    driver.get('http://www.boston.gov.uk/index.aspx?articleid=6207&ShowAdvancedSearch=true')
    mySelectElement = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "select#DatePresets[name='DatePresets']"))))
    mySelectElement.select_by_visible_text('Last month')
    driver.find_element_by_css_selector("input.button[name='searchFilter']").click()
    numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "input.pageNumberButton"))))
    print(numLinks)
    for i in range(numLinks):
    print("Perform your scraping here on page {}".format(str(i+1)))
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//input[@class='pageNumberButton selected']//following::input[1]"))).click()
    driver.quit()
    
  • Console output:

    DevTools listening on ws://127.0.0.1:12115/devtools/browser/2ece3f6a-0431-4b74-9276-f61fcf70dd6d
    10
    Perform your scraping here on page 1
    Perform your scraping here on page 2
    Perform your scraping here on page 3
    Perform your scraping here on page 4
    Perform your scraping here on page 5
    Perform your scraping here on page 6
    Perform your scraping here on page 7
    Perform your scraping here on page 8
    Perform your scraping here on page 9
    Perform your scraping here on page 10