我正在尝试在网站上获取超过一千个页面的HTML。 我的脚本正在运行,但是在随机数量的页面恢复之后,脚本无缘无故地转到下一页。他没有把我想要的所有页面都计入。
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import os.path
classes = ["sup","spé","b1","b2","b3","2C",
"2A","2D","m1","m2","m3","2B","3C","3B",'3D',"3A"]
def CrawlingAll():
for Classes in classes :
dir = os.path.dirname(__file__)
print(dir)
filename = os.path.join(dir, 'Fiche', Classes)
print(filename)
if not os.path.exists(filename):
os.makedirs(filename)
driver = webdriver.Chrome()
#Identification
driver.get("https://username:pw@web.esme.fr/sas/common/archives.aspx")
year = driver.find_element_by_id("sas_SelectionPlaceHolder__anneeSelect")
year.send_keys("1998")
classe = driver.find_element_by_id("sas_SelectionPlaceHolder_nomClasse")
classe.send_keys(Classes)
search = driver.find_element_by_id("sas_SelectionPlaceHolder__btnSubmit")
search.click()
#iteration du clicks des fiches eleves
try:
for i in range(1, 10000):
fiche = driver.find_element_by_xpath('//*[@id="liste"]/tbody/tr[%s]/td[4]/a'%(i))
fiche.click()
driver.switch_to_window(driver.window_handles[1]) #switch vers la pop up
source_code = driver.page_source #recuperation du code source de la pop up
driver.close() #fermeture de la pop up
driver.switch_to_window(driver.window_handles[0]) #retour a la fenetre principale
#Ecriture des fiches html / eleves
with open("%s/file-"%filename + str(i) + ".html", 'wb') as f:
f.write(source_code.encode('utf-16'))
f.close()
except:
print ("======DONE======")
driver.close() #fermeture de la fenetre principale
def Crawlingclasses():
print("Select the class your want to crawl: ")
print(classes)
Classe = input ()
print("Select when do you want to start crawling ex: 2015: ")
Year = input()
dir = os.path.dirname(__file__)
print (dir)
filename = os.path.join(dir,'Fiche',Classe)
print (filename)
if not os.path.exists(filename):
os.makedirs(filename)
driver = webdriver.Chrome()
# Identification
driver.get("https://username:pw@web.esme.fr/sas/common/archives.aspx")
# Remplissage des boxs
# ================================= CRAWLING SUP =================================
year = driver.find_element_by_id("sas_SelectionPlaceHolder__anneeSelect")
year.send_keys(Year)
classe = driver.find_element_by_id("sas_SelectionPlaceHolder_nomClasse")
classe.send_keys(Classe)
search = driver.find_element_by_id("sas_SelectionPlaceHolder__btnSubmit")
search.click()
# iteration du clicks des fiches eleves
try:
for i in range(1, 10000):
fiche = driver.find_element_by_xpath('//*[@id="liste"]/tbody/tr[%s]/td[4]/a' % (i))
fiche.click()
driver.switch_to_window(driver.window_handles[1]) # switch vers la pop up
source_code = driver.page_source # recuperation du code source de la pop up
driver.close() # fermeture de la pop up
driver.switch_to_window(driver.window_handles[0]) # retour a la fenetre principale
# Ecriture des fiches html / eleves
with open("%s/file-"%filename + str(i) + ".html", 'wb') as f:
f.write(source_code.encode('utf-16'))
f.close()
except:
print("======DONE======")
driver.close() # fermeture de la fenetre principale
尝试获取的页数在这里尝试:
for i in range(1, 10000):
知道为什么硒会跳过很多页面吗?
答案 0 :(得分:0)
我非常确定您的代码会出现错误,但您没有在您尝试的情况下打印错误消息,除了阻止,很难说出哪种错误。
现在我建议如果遇到错误而不是跳过整个循环,则跳过迭代:
import sys
for i in range(1, 10000):
try:
fiche = driver.find_element_by_xpath('//*[@id="liste"]/tbody/tr[%s]/td[4]/a' % (i))
fiche.click()
driver.switch_to_window(driver.window_handles[1]) # switch vers la pop up
source_code = driver.page_source # recuperation du code source de la pop up
driver.close() # fermeture de la pop up
driver.switch_to_window(driver.window_handles[0]) # retour a la fenetre principale
# Ecriture des fiches html / eleves
with open("%s/file-"%filename + str(i) + ".html", 'wb') as f:
f.write(source_code.encode('utf-16'))
f.close()
except:
print("Error at iteration %s:"%i, sys.exc_info()[0])