我尝试过多种从网站自动下载PDF文档的方法,最终决定使用Selenium来浏览网站并下载*.pdf文件。但是,我无法阻止下载对话框弹出。
如能提供任何帮助,我将不胜感激……
这是我的脚本:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import re
import os.path
# Build a Firefox profile that saves downloads silently instead of showing
# the "open / save" dialog.

# MIME types Firefox should write straight to disk without asking.
NO_PROMPT_MIME_TYPES = (
    "application/x-pdf",
    "application/acrobat",
    "application/vnd.pdf",
    "text/pdf",
    "text/x-pdf",
    "application/vnd.cups-pdf",
    "text/csv",
    "application/x-msexcel",
    "application/excel",
    "application/x-excel",
    "application/vnd.ms-excel",
    "image/png",
    "image/jpeg",
    "text/html",
    "text/plain",
    "application/msword",
    "application/xml",
    "application/pdf",
)

profile = webdriver.FirefoxProfile()
# 2 = save into the directory named by "browser.download.dir" rather than
# the desktop (0) or the default Downloads folder (1).
profile.set_preference("browser.download.folderList", 2)
profile.set_preference("browser.download.alertOnEXEOpen", False)
# The key is "neverAsk.saveToDisk" (with a dot before "saveToDisk"); a
# misspelled key is stored as an unknown preference and the download dialog
# keeps appearing.
profile.set_preference(
    "browser.helperApps.neverAsk.saveToDisk",
    ",".join(NO_PROMPT_MIME_TYPES),
)
profile.set_preference("browser.helperApps.alwaysAsk.force", False)
# Legacy download-manager switches, kept from the original script.
# NOTE(review): presumably ignored by recent Firefox releases - verify.
profile.set_preference("browser.download.manager.showWhenStarting", False)
profile.set_preference("browser.download.manager.focusWhenStarting", False)
profile.set_preference("browser.download.manager.alertOnEXEOpen", False)
profile.set_preference("browser.download.manager.closeWhenDone", False)
profile.set_preference("browser.download.manager.showAlertOnComplete", False)
profile.set_preference("browser.download.manager.useWindow", False)
profile.set_preference(
    "services.sync.prefs.sync.browser.download.manager.showWhenStarting",
    False,
)
# Disable the built-in PDF viewer so PDFs are downloaded, not rendered inline.
profile.set_preference("pdfjs.disabled", True)
# Drive the SMAD external search: run a query, open the first result, expand
# its "Maps and Documents" tree and trigger the download of the first file.

# ASP.NET control prefixes shared by every locator below.
NAME_PREFIX = "ctl00$ContentPlaceHolder1$plc1Content$ucExternalAssessmentSearchView$"
ID_PREFIX = "ctl00_ContentPlaceHolder1_plc1Content_ucExternalAssessmentSearchView_"
TREE_ID_PREFIX = ID_PREFIX + "ucView_ucMapsAndDocuments_tvLogFolderAndFile"

# Base directory for downloads; a per-file sub-folder is created further down.
save_path = "C:/Users/Jacob/Documents/ArcGIS/Saskachewan/Assessment work/"
# The profile is handed to Firefox when the browser starts, so the download
# directory has to be set before launch. normpath() yields native path
# separators on Windows.
profile.set_preference("browser.download.dir", os.path.normpath(save_path))

# Open Firefox with the prepared profile and load the search page.
browser = webdriver.Firefox(profile)
browser.get('http://www.smad.gov.sk.ca/Pages/BasePages/Main.aspx?UseCase=ExternalSearch')


def _wait_for(locator, timeout):
    """Wait until the element for *locator* is present and return it.

    Raises selenium's TimeoutException if the element does not appear within
    *timeout* seconds, so the script stops at the step that failed instead of
    clicking an element that never loaded.
    """
    return WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located(locator)
    )


# Type the query into the file-number box and submit it.
query_box = browser.find_element(By.ID, ID_PREFIX + "txtFileNumber")
query_box.send_keys('-' + Keys.RETURN)

# Wait for the result grid, then open the first file.
_wait_for((By.NAME, NAME_PREFIX + "grdMainSearch$ctl03$btnViewDetails"), 80).click()

# Wait for the detail view, then switch to the maps-and-documents tab.
_wait_for(
    (By.NAME, NAME_PREFIX + "ucView$ucAssessmentFileView$btnMapsAndDocumentsTreeView"),
    60,
).click()

# Wait for the category filter, choose the fifth radio button ("ALL") and
# refresh the tree.
refresh_locator = (By.NAME, NAME_PREFIX + "ucView$ucMapsAndDocuments$btnRefresh")
_wait_for(refresh_locator, 60)
category_selector = (
    "input[type='radio'][name='"
    + NAME_PREFIX
    + "ucView$ucMapsAndDocuments$lstCategoryTypes']"
)
browser.find_elements(By.CSS_SELECTOR, category_selector)[4].click()
# Look the button up again after the radio click rather than reusing the
# element returned by the wait.
browser.find_element(*refresh_locator).click()

# Wait for the tree, then expand its folders. The tree nodes are located by
# ID (as in the clicks below), so the wait must use By.ID as well.
_wait_for((By.ID, TREE_ID_PREFIX + "n0"), 60)
for node_suffix in ("n3", "n0", "n5", "n7", "n9"):
    browser.find_element(By.ID, TREE_ID_PREFIX + node_suffix).click()

# Create a folder named after the assessment file number.
Name_of_folder = browser.find_element(
    By.ID, ID_PREFIX + "ucView_ucMapsAndDocuments_lblAssessmentFileNumberValue"
).text
newpath = os.path.join(save_path, Name_of_folder)
os.makedirs(newpath, exist_ok=True)
# NOTE(review): changing the profile object here does not retarget the
# browser session that is already running, so the file below is expected to
# land in save_path. Moving it into newpath afterwards is still TODO.
profile.set_preference("browser.download.dir", os.path.normpath(newpath))

# Download the first file in the tree.
browser.find_element(By.ID, TREE_ID_PREFIX + "t1").click()
答案 0 :(得分:0)
在Firefox中,您可以将首选项
browser.download.manager.showAlertOnComplete
设置为false,
以禁用这些下载弹出窗口。