我需要从网页下载一组单独的pdf文件。它是政府(土耳其教育部)公开提供的,因此完全合法。
但是我的硒浏览器仅显示pdf文件,如何下载并命名。
(此代码也来自网络)
# Import your newly installed selenium package
from selenium import webdriver
from bs4 import BeautifulSoup
# Now create an 'instance' of your driver
# This path should be to wherever you downloaded the driver
driver = webdriver.Chrome(executable_path="/Users/ugur/Downloads/chromedriver")
# A new Chrome (or other browser) window should open up
download_dir = "/Users/ugur/Downloads/" # for linux/*nix, download_dir="/usr/Public"
options = webdriver.ChromeOptions()
profile = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}], # Disable Chrome's PDF Viewer
"download.default_directory": download_dir , "download.extensions_to_open": "applications/pdf"}
options.add_experimental_option("prefs", profile)
# Now just tell it wherever you want it to go
driver.get("https://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid=5&ders=29")
driver.find_element_by_id("ContentPlaceHolder1_dtYillikPlanlar_lnkIndir_2").click()
driver.get("https://odsgm.meb.gov.tr/kurslar/PDFFile.aspx?name=kazanimtestleri.pdf")
预先感谢
其他信息:
我有一个完美的python 2代码。但是,它以某种方式创建了空文件,我无法将其转换为python3。也许这有所帮助(没有冒犯性,但我从不喜欢硒)
import urllib
import urllib2
from bs4 import BeautifulSoup
import os
sinifId=5
maxOrd = 1
fileNames=[]
directory = '/Users/ugur/Downloads/Hasan'
print 'List of current files in directory '+ directory+'\n---------------------------------\n\n'
for current_file in os.listdir(directory):
if (current_file.find('pdf')>-1 and current_file.find(' ')>-1):
print current_file
order = int(current_file.split(' ',1)[0])
if order>maxOrd: maxOrd=order
fileNames.append(current_file.split(' ',2)[1])
print '\n\nStarting download \n---------------------------------\n'
ctA=int(maxOrd+1)
for ders in [29]:
urlSinif='http://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)
page = urllib2.urlopen(urlSinif)
soup = BeautifulSoup(page,"lxml")
st = soup.prettify()
count=st.count('ctl00')-1
dersAdi = soup.find('a', href='/kurslar/CevapAnahtarlari.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)).getText().strip()
for testNo in range(count):
if(str(sinifId)+str(ders)+str(testNo+1) in fileNames):
print str(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf'+' skipped'
else:
annex=""
if(testNo%2==1): annex="2"
eiha_url = u'http://odsgm.meb.gov.tr/kurslar/KazanimTestleri.aspx?sinifid='+str(sinifId)+'&ders='+str(ders)
data = ('__EVENTTARGET','ctl00$ContentPlaceHolder1$dtYillikPlanlar$ctl'+format(testNo, '02')+'$lnkIndir'+annex), ('__EVENTARGUMENT', '39')
print 'ctl00$ContentPlaceHolder1$dtYillikPlanlar$ctl'+format(testNo, '02')+'$lnkIndir'+annex
new_data = urllib.urlencode(data)
response = urllib2.urlopen(eiha_url, new_data)
urllib.urlretrieve (str(response.url), directory+'/{0:0>3}'.format(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf')
print str(ctA)+' '+str(sinifId)+str(ders)+str(testNo+1)+' '+dersAdi+str(testNo+1)+'.pdf'+' downloaded'
ctA=ctA+1
答案 0 :(得分:2)
在启动Chrome之前添加您的选项,然后指定chrome_options
参数。
download_dir = "/Users/ugur/Downloads/"
options = webdriver.ChromeOptions()
profile = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
"download.default_directory": download_dir,
"download.extensions_to_open": "applications/pdf"}
options.add_experimental_option("prefs", profile)
driver = webdriver.Chrome(
executable_path="/Users/ugur/Downloads/chromedriver",
chrome_options=options
)
回答第二个问题:
我还能问一下如何指定文件名吗?
我发现了这个:Selenium give file name when downloading
我要做的是:
file_name = ''
while file_name.lower().endswith('.pdf') is False:
time.sleep(.25)
try:
file_name = max([download_dir + '/' + f for f in os.listdir(download_dir)], key=os.path.getctime)
except ValueError:
pass
答案 1 :(得分:0)
非硒解决方案,您可以执行以下操作:
import requests
pdf_resp = requests.get("https://odsgm.meb.gov.tr/kurslar/PDFFile.aspx?name=kazanimtestleri.pdf")
with open("save.pdf", "wb") as f:
f.write(pdf_resp.content)
尽管您可能要先检查内容类型以确保它是pdf
答案 2 :(得分:0)
这是我用来下载具有特定文件名的pdf的代码示例。首先,您需要使用必需的选项配置chrome webdriver。然后,单击按钮(打开pdf弹出窗口)后,调用一个函数,等待下载完成并重命名下载的文件。
import os
import time
import shutil
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# function to wait for download to finish and then rename the latest downloaded file
def wait_for_download_and_rename(newFilename):
# function to wait for all chrome downloads to finish
def chrome_downloads(drv):
if not "chrome://downloads" in drv.current_url: # if 'chrome downloads' is not current tab
drv.execute_script("window.open('');") # open a new tab
drv.switch_to.window(driver.window_handles[1]) # switch to the new tab
drv.get("chrome://downloads/") # navigate to chrome downloads
return drv.execute_script("""
return document.querySelector('downloads-manager')
.shadowRoot.querySelector('#downloadsList')
.items.filter(e => e.state === 'COMPLETE')
.map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
""")
# wait for all the downloads to be completed
dld_file_paths = WebDriverWait(driver, 120, 1).until(chrome_downloads) # returns list of downloaded file paths
# Close the current tab (chrome downloads)
if "chrome://downloads" in driver.current_url:
driver.close()
# Switch back to original tab
driver.switch_to.window(driver.window_handles[0])
# get latest downloaded file name and path
dlFilename = dld_file_paths[0] # latest downloaded file from the list
# wait till downloaded file appears in download directory
time_to_wait = 20 # adjust timeout as per your needs
time_counter = 0
while not os.path.isfile(dlFilename):
time.sleep(1)
time_counter += 1
if time_counter > time_to_wait:
break
# rename the downloaded file
shutil.move(dlFilename, os.path.join(download_dir,newFilename))
return
# specify custom download directory
download_dir = r'c:\Downloads\pdf_reports'
# for configuring chrome pdf viewer for downloading pdf popup reports
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', {
"download.default_directory": download_dir, # Set own Download path
"download.prompt_for_download": False, # Do not ask for download at runtime
"download.directory_upgrade": True, # Also needed to suppress download prompt
"plugins.plugins_disabled": ["Chrome PDF Viewer"], # Disable this plugin
"plugins.always_open_pdf_externally": True, # Enable this plugin
})
# get webdriver with options for configuring chrome pdf viewer
driver = webdriver.Chrome(options = chrome_options)
# open desired webpage
driver.get('https://mywebsite.com/mywebpage')
# click the button to open pdf popup
driver.find_element_by_id('someid').click()
# call the function to wait for download to finish and rename the downloaded file
wait_for_download_and_rename('My file.pdf')
# close the browser windows
driver.quit()
根据需要将超时(120)设置为等待时间。