在尝试使用硒抓取该网站时。
我可以使用该代码,但目前仅刮取第一页。该页面使用输入按钮作为浏览页面的一种方式,所以我想一个个地单击每个按钮,但这是行不通的,有人有其他方法可以处理这种类型的分页吗?
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
options = Options()
# options.add_argument('--headless')
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
driver=webdriver.Chrome(chrome_options=options,
executable_path=r'/Users/liban/Downloads/chromedriver')
url = 'http://www.boston.gov.uk/index.aspx?articleid=6207&ShowAdvancedSearch=true'
driver.get(url)
def get_Data():
data = []
divs = driver.find_element_by_xpath('//*[@id="content"]/form').find_elements_by_tag_name('div')
for div in divs:
app_number = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/h4/a').text
address = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/p[5]').text
status = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/p[1]/strong').text
link = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/h4/a').get_attribute("href")
proposals = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/p[3]').text
data.append({"caseRef": app_number, "propDesc": proposals, "address": address, "caseUrl": link, "status": status})
print(data)
return data
def navigation():
data = []
total_inputs = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/table/tbody/tr/td/input')
for input in total_inputs:
input.click()
data.extend(get_Data())
def main():
all_data = []
select = Select(driver.find_element_by_xpath('//*[@id="DatePresets"]'))
select.select_by_index(7)
search_by = driver.find_element_by_xpath('//*[@id="radio-ReceivedDate"]')
search_by.click()
show = Select(driver.find_element_by_xpath('//*[@id="ResultSize"]'))
show.select_by_index(4)
search_button = driver.find_element_by_xpath('//*[@id="content"]/form/input[3]')
search_button.click()
all_data.extend(navigation())
if __name__ == "__main__":
main()
网站如何处理分页:
<td align="center">
<input type="submit" class="pageNumberButton selected" name="searchResults_Page" value="1" disabled="disabled"/>
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="2" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="3" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="4" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="5" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="6" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="7" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="8" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="9" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="10" />
</td>
手动步骤:
答案 0 :(得分:2)
尝试:
find_elements_by_xpath
而非find_element_by_xpath
,它将返回您的列表。
PS:我没有在本地尝试您的代码,但是您提到的错误是我提到的解决方案。
答案 1 :(得分:1)
根据您在http://www.boston.gov.uk/index.aspx?articleid=6207
网站上处理分页的问题,可以使用以下解决方案:
代码块:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
driver.get('http://www.boston.gov.uk/index.aspx?articleid=6207&ShowAdvancedSearch=true')
mySelectElement = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "select#DatePresets[name='DatePresets']"))))
mySelectElement.select_by_visible_text('Last month')
driver.find_element_by_css_selector("input.button[name='searchFilter']").click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "input.pageNumberButton"))))
print(numLinks)
for i in range(numLinks):
print("Perform your scrapping here on page {}".format(str(i+1)))
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//input[@class='pageNumberButton selected']//following::input[1]"))).click()
driver.quit()
控制台输出:
DevTools listening on ws://127.0.0.1:12115/devtools/browser/2ece3f6a-0431-4b74-9276-f61fcf70dd6d
10
Perform your scrapping here on page 1
Perform your scrapping here on page 2
Perform your scrapping here on page 3
Perform your scrapping here on page 4
Perform your scrapping here on page 5
Perform your scrapping here on page 6
Perform your scrapping here on page 7
Perform your scrapping here on page 8
Perform your scrapping here on page 9
Perform your scrapping here on page 10