我要提取具有9000页的数据。提取大约1700页后,当我希望它继续工作时便停止工作,它从头开始,并在大约1000页后执行。在此代码中,我必须手动选择区域。如何剪贴总页数? chromedriver(用于会话)有时间限制吗?
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import csv
url = "https://www.mcg.gov.in/default1.aspx?HTT=B"
driver = webdriver.Chrome(executable_path = 'D:/Python_module/chromedriver_win32/chromedriver.exe')
driver.get(url)
time.sleep(4)
driver.find_element_by_xpath('//*[@id="CphContentPlaceHolderbody_mcg"]/section/div[1]/div/a[1]/div').click()
time.sleep(2)
driver.find_element_by_xpath('//*[@id="CphContentPlaceHolderbody_lnkViewSurveyDataBtn"]').click()
time.sleep(4)
driver.find_element_by_xpath('//*[@id="CphContentPlaceHolderbody_PropertySearchControl1_btnSearch"]').click()
time.sleep(4)
#-----------------This is for extracting the data of page-1-----------------------------------
driver.find_element_by_xpath('//*[@id="form"]/div[4]/div[11]/table/tbody/tr/td[12]/a').click()
time.sleep(1)
print("If you are in second page then the code is fine.")
soup = BeautifulSoup(driver.page_source, 'html.parser')
current_url = driver.current_url
table = soup.find('table', {'class':'table table-hover table-bordered'})
#divs = soup.find('div', {'id':'CphContentPlaceHolderbody_PropertySearchControl1_upTop'})
print(table)
for row in table.findAll('tr')[1:]:
raw_data = row.findAll('td')[0:]
property_id = raw_data[0].text
ward_no = raw_data[1].text
owner = raw_data[2].text
print(owner)
page_no = page_no+1
try:
while True:
driver.find_element_by_xpath('//*[@id="form"]/div[4]/div[11]/table/tbody/tr/td[14]/a').click()
time.sleep(1)
print("If you are in second page then the code is fine.")
soup = BeautifulSoup(driver.page_source, 'html.parser')
current_url = driver.current_url
table = soup.find('table', {'class':'table table-hover table-bordered'})
#divs = soup.find('div', {'id':'CphContentPlaceHolderbody_PropertySearchControl1_upTop'})
#print(table)
for row in table.findAll('tr')[1:]:
raw_data = row.findAll('td')[0:]
property_id = raw_data[0].text
ward_no = raw_data[1].text
owner = raw_data[2].text
print(owner)
page_no = page_no+1
except:
while True:
driver.find_element_by_xpath('//*[@id="form"]/div[4]/div[11]/table/tbody/tr/td[19]/a').click()
time.sleep(1)
print("If you are in second page then the code is fine.")
soup = BeautifulSoup(driver.page_source, 'html.parser')
current_url = driver.current_url
table = soup.find('table', {'class':'table table-hover table-bordered'})
#divs = soup.find('div', {'id':'CphContentPlaceHolderbody_PropertySearchControl1_upTop'})
#print(table)
for row in table.findAll('tr')[1:]:
raw_data = row.findAll('td')[0:]
owner = raw_data[2].text
print(owner)
page_no = page_no+1
print("Successfully scrap the data")
driver.quit()
它给出以下错误:
Traceback (most recent call last):
File "D:\C Folder\program\scrap\scrap_mcg.py", line 64, in <module>
element = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="form"]/div[4]/div[11]/table/tbody/tr/td[14]/a')))
File "C:\Users\asn\AppData\Local\Programs\Python\Python36-32\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\C Folder\program\scrap\scrap_mcg.py", line 90, in <module>
soup = BeautifulSoup(driver.page_source, 'html.parser')
File "C:\Users\asn\AppData\Local\Programs\Python\Python36-32\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 670, in page_source
return self.execute(Command.GET_PAGE_SOURCE)['value']
File "C:\Users\asn\AppData\Local\Programs\Python\Python36-32\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 312, in execute
self.error_handler.check_response(response)
File "C:\Users\asn\AppData\Local\Programs\Python\Python36-32\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
(Session info: chrome=69.0.3497.100)
(Driver info: chromedriver=2.37.544315 (730aa6a5fdba159ac9f4c1e8cbc59bf1b5ce12b7),platform=Windows NT 6.1.7601 SP1 x86_64)
答案 0 :(得分:0)
title =“ Page 336”>旁边的...在点(988, 604)。其他元素将获得点击:...
如错误所述,该元素无法接受点击。 可能有各种各样的东西阻止点击元素。
几种解决问题的方法:
如果我们还有其他需要帮助的地方,请告诉我们。