我在 Windows 上通过 ChromeDriver 使用 Selenium 和 Python,以自动执行从不同页面下载大量文件的任务。我的代码可以运行,但解决方案远非理想:下面的函数会点击网站上的一个按钮,该按钮触发一个 JavaScript 函数,生成 PDF 文件并随后下载。
我不得不使用静态等待来等待下载完成(很丑陋)。我无法通过检查文件系统来确认下载何时完成,因为我使用了多线程(同时从不同页面批量下载文件),而且文件名是由网站本身动态生成的。
我的代码:
# Load url[num] in the given driver, click the download button, then pause for a fixed
# 10 seconds. The retry branch is elided in this snippet ("....." below).
def file_download(num, drivervar):
# NOTE(review): Counter is assigned here without a visible `global`/`nonlocal`
# declaration; as written this would raise UnboundLocalError — confirm against the full script.
Counter += 1
try:
drivervar.get(url[num])
# Wait up to 20 seconds for the download button to become clickable.
download_button = WebDriverWait(drivervar, 20).until(EC.element_to_be_clickable((By.ID, 'download button ID')))
download_button.click()
# Fixed pause standing in for "download finished"; it does not observe the download itself.
time.sleep(10)
except TimeoutException: # Retry once
print('Timeout in thread number: ' + str(num) + ', retrying...')
.....
是否可以在webdriver中确定下载完成?我想避免使用time.sleep(x)。
非常感谢。
答案 0 :(得分:12)
您可以通过导航chrome://downloads/
与驱动程序获取每次下载的状态。
等待所有下载完成并列出所有路径:
def every_downloads_chrome(driver):
    """Report Chrome's downloads once every one of them has finished.

    Navigates ``driver`` to ``chrome://downloads/`` unless it is already
    there, then runs a script against the page's download manager.

    Returns whatever the script returns: the ``file_url`` of each download
    item when all items are in the ``"COMPLETE"`` state; otherwise the
    script returns nothing, so the function is usable as a
    ``WebDriverWait(...).until`` predicate.
    """
    on_downloads_page = driver.current_url.startswith("chrome://downloads")
    if not on_downloads_page:
        driver.get("chrome://downloads/")

    script = """
        var items = downloads.Manager.get().items_;
        if (items.every(e => e.state === "COMPLETE"))
            return items.map(e => e.file_url);
        """
    return driver.execute_script(script)
# Poll every_downloads_chrome (timeout 120, poll interval 1) until it returns a
# truthy value; `until` hands back that value, i.e. the list of download paths.
paths = WebDriverWait(driver, 120, 1).until(every_downloads_chrome)
print(paths)
答案 1 :(得分:5)
使用Chrome 80,我必须通过以下代码更改@ florent-b的答案:
def every_downloads_chrome(driver):
    """Return the paths of the downloads Chrome reports as complete.

    Navigates ``driver`` to ``chrome://downloads/`` unless it is already
    there, then reads the items of the ``#downloadsList`` element inside the
    ``downloads-manager`` shadow root.

    Only items whose state is ``'COMPLETE'`` are included; for each one the
    first available of ``filePath``, ``file_path``, ``fileUrl`` and
    ``file_url`` is returned. Unfinished downloads are simply left out.
    """
    already_there = driver.current_url.startswith("chrome://downloads")
    if not already_there:
        driver.get("chrome://downloads/")

    completed_paths_js = """
        return document.querySelector('downloads-manager')
            .shadowRoot.querySelector('#downloadsList')
            .items.filter(e => e.state === 'COMPLETE')
            .map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
        """
    return driver.execute_script(completed_paths_js)
我相信这是向后兼容的,也就是说它同样适用于旧版本的 Chrome。
答案 2 :(得分:2)
以无头模式运行Chrome时,打开chrome://downloads/
会出现问题。
以下功能使用一种复合方法,无论该模式是否为无头模式,它都可以工作,并选择每种模式中可用的更好方法。
假定每次调用此函数后,调用方都会清除在file_download_path
下载的所有文件。
import logging
import os
import time

from selenium.webdriver.support.ui import WebDriverWait
def wait_for_downloads(driver, file_download_path, headless=False, num_files=1):
    """Block until ``num_files`` downloads have finished, in either Chrome mode.

    In headless mode ``chrome://downloads/`` is not usable, so the download
    directory is polled instead. Otherwise the downloads page is opened in a
    temporary tab, polled, cleared and closed again.

    The caller is expected to empty ``file_download_path`` after each call,
    because the headless check compares the directory's file count with
    ``num_files``.

    Args:
        driver: Selenium WebDriver controlling Chrome (unused when headless).
        file_download_path: Directory Chrome downloads into.
        headless: Whether Chrome is running in headless mode.
        num_files: Number of completed downloads to wait for.

    Raises:
        selenium.common.exceptions.TimeoutException: In non-headless mode,
            raised by ``WebDriverWait`` if the downloads do not complete in
            time. In headless mode a timeout is only logged.
    """
    max_delay = 60
    interval_delay = 0.5
    if headless:
        _wait_for_files_on_disk(file_download_path, num_files,
                                max_delay, interval_delay)
    else:
        _wait_on_downloads_page(driver, num_files, max_delay, interval_delay)


def _wait_for_files_on_disk(file_download_path, num_files, max_delay, interval_delay):
    """Poll the download directory until it holds ``num_files`` finished files."""
    total_delay = 0
    while total_delay < max_delay:
        # macOS adds a .DS_Store entry that must not be counted as a download.
        files = [f for f in os.listdir(file_download_path) if f != '.DS_Store']
        still_downloading = any(f.endswith('.crdownload') for f in files)
        if len(files) == num_files and not still_downloading:
            return
        time.sleep(interval_delay)
        total_delay += interval_delay
    logging.error("File(s) couldn't be downloaded")


def _wait_on_downloads_page(driver, num_files, max_delay, interval_delay):
    """Poll chrome://downloads/ in a temporary tab until ``num_files`` are complete."""
    count_complete_js = """
        var items = document.querySelector('downloads-manager')
            .shadowRoot.querySelector('#downloadsList').items;
        var count = 0;
        for (var i = 0; i < items.length; i++) {
            if (items[i].state === 'COMPLETE') {count++;}
        }
        return count === arguments[0];
        """
    # Remember where we came from instead of assuming it is window_handles[0].
    original_handle = driver.current_window_handle
    driver.execute_script("window.open();")
    # switch_to_window() was removed in Selenium 4; switch_to.window() works in 3 and 4.
    driver.switch_to.window(driver.window_handles[-1])
    try:
        driver.get('chrome://downloads/')
        # Wait for downloads to complete
        WebDriverWait(driver, max_delay, interval_delay).until(
            lambda d: d.execute_script(count_complete_js, num_files))
        # Clear all downloads from chrome://downloads/
        driver.execute_script("""
            document.querySelector('downloads-manager').shadowRoot
            .querySelector('#toolbar').shadowRoot
            .querySelector('#moreActionsMenu')
            .querySelector('button.clear-all').click()
            """)
    finally:
        # Close the helper tab even on timeout so it does not leak.
        driver.close()
        driver.switch_to.window(original_handle)
答案 3 :(得分:1)
我遇到了同样的问题,并找到了解决方案:检查下载文件夹中是否存在 .crdownload 文件。如果下载文件夹中没有任何扩展名为 .crdownload 的文件,则说明所有下载都已完成。我认为这仅适用于 Chrome 和 Chromium。
def downloads_done():
    """Block until the ``data/`` directory contains no ``.crdownload`` entries.

    The directory is re-listed every 0.5 seconds. A plain loop is used rather
    than recursion so that a long-running download cannot exhaust the
    interpreter's recursion limit, and so that every check sees a fresh
    listing.
    """
    while any(".crdownload" in name for name in os.listdir("data/")):
        time.sleep(0.5)
每当您调用downloads_done()时,它都会循环运行,直到完成所有下载。如果您要下载80 GB的海量文件,那么我不建议您这样做,因为这样函数可以达到最大递归深度。
答案 4 :(得分:1)
为了能返回多个下载项,我不得不将 @thdox 的答案修改为以下代码:
def every_downloads_chrome(driver):
    """Return the paths of all Chrome downloads once every one is complete.

    Navigates ``driver`` to ``chrome://downloads/`` unless it is already
    there, then reads the items of the ``#downloadsList`` element inside the
    ``downloads-manager`` shadow root.

    When every item is in the ``'COMPLETE'`` state, the first available of
    ``filePath``, ``file_path``, ``fileUrl`` and ``file_url`` is returned for
    each item. Otherwise the script returns nothing, so the function is
    usable as a ``WebDriverWait(...).until`` predicate.
    """
    showing_downloads = driver.current_url.startswith("chrome://downloads")
    if not showing_downloads:
        driver.get("chrome://downloads/")

    all_paths_js = """
        var elements = document.querySelector('downloads-manager')
            .shadowRoot.querySelector('#downloadsList')
            .items
        if (elements.every(e => e.state === 'COMPLETE'))
            return elements.map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
        """
    return driver.execute_script(all_paths_js)
答案 5 :(得分:0)
import os
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
# Example test case: trigger a download, wait for a file to appear in the download
# directory, then wait until the directory's total size stops changing.
# NOTE(review): `unittest` and `time` are used below but are not among the imports shown
# with this snippet, and `self.download_path` is never assigned here — confirm they are
# provided elsewhere.
class MySeleniumTests(unittest.TestCase):
selenium = None
@classmethod
def setUpClass(cls):
cls.selenium = webdriver.Firefox(...)
...
def test_download(self):
# Work inside the download directory so the bare os.listdir()/os.stat() calls below inspect it.
os.chdir(self.download_path) # default download directory
# Open the page and click the download button.
self.selenium.get(...)
self.selenium.find_element_by_xpath(...).click()
# Step 1: wait until the download has started, i.e. the directory is no longer empty.
def download_begin(driver):
if len(os.listdir()) == 0:
time.sleep(0.5)
return False
else:
return True
WebDriverWait(self.selenium, 120).until(download_begin) # the maximum waiting time is 120s
# Step 2: wait until the download has finished.
# While the total size of the directory is still changing, keep waiting.
def download_complete(driver):
# -1 can never equal a real size sum, so the loop below runs at least once.
sum_before=-1
sum_after=sum([os.stat(file).st_size for file in os.listdir()])
# Re-measure every 0.2 s until two consecutive measurements are equal.
while sum_before != sum_after:
time.sleep(0.2)
sum_before = sum_after
sum_after = sum([os.stat(file).st_size for file in os.listdir()])
return True
WebDriverWait(self.selenium, 120).until(download_complete) # the maximum waiting time is 120s
你必须做这些事
(我的英语不太好)
答案 6 :(得分:0)
这可能不适用于所有用例,但由于我只需要等待一个pdf下载,效果很好。基于Walter's comment above。
def get_non_temp_len(download_dir):
    """Count the entries in ``download_dir`` that are not in-progress downloads.

    Entries whose names end in ``.tmp`` or ``.crdownload`` are treated as
    temporary and left out of the count.
    """
    temp_suffixes = ('.tmp', '.crdownload')
    return sum(
        1
        for entry in os.listdir(download_dir)
        if not entry.endswith(temp_suffixes)
    )
download_dir = 'your/download/dir'
download_timeout = 120  # seconds to wait before giving up on the download
original_count = get_non_temp_len(download_dir)  # get the file count at the start
# do your selenium stuff
try:
    deadline = time.monotonic() + download_timeout
    # A finished download shows up as one more non-temporary file in the directory.
    while original_count == get_non_temp_len(download_dir):
        if time.monotonic() >= deadline:
            # Without a deadline a download that never starts would hang the script forever.
            raise TimeoutError(
                'No new file appeared in %r within %s seconds'
                % (download_dir, download_timeout))
        time.sleep(.5)  # wait for file count to change
finally:
    # Always release the browser, even if the wait fails.
    driver.quit()
答案 7 :(得分:0)
我遇到了同样的问题,这种方法对我有用。
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException
from threading import Thread
import os
import datetime
def checkFilePresence(downloadPath, numberOfFilesInitially, artistName,
                      songTitle, pollInterval=0.5, timeout=None):
    """Block until a new ``.mp3`` file shows up under ``downloadPath``.

    A file counts as new when the top level of ``downloadPath`` holds more
    entries than ``numberOfFilesInitially`` and some ``.mp3`` file anywhere
    below ``downloadPath`` has an ``os.path.getctime`` timestamp later than
    the moment this function was called.

    Args:
        downloadPath: Directory the browser downloads into.
        numberOfFilesInitially: Entry count of ``downloadPath`` before the
            download was triggered.
        artistName: Unused; kept for interface compatibility.
        songTitle: Unused; kept for interface compatibility.
        pollInterval: Seconds to sleep between scans, so the wait does not
            spin a CPU core.
        timeout: Maximum number of seconds to wait, or ``None`` (default) to
            wait indefinitely.

    Raises:
        TimeoutError: If ``timeout`` is given and no new ``.mp3`` file
            appeared in time.
    """
    startedAt = datetime.datetime.now()
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        if len(os.listdir(downloadPath)) > numberOfFilesInitially:
            for folder, _subfolders, files in os.walk(downloadPath):
                for name in files:
                    if not name.endswith('.mp3'):
                        continue
                    try:
                        changedAt = datetime.datetime.fromtimestamp(
                            os.path.getctime(os.path.join(folder, name)))
                    except OSError:
                        # The file was renamed or removed between the walk and the stat.
                        continue
                    if changedAt > startedAt:
                        return

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                'No new .mp3 file appeared in %r within %s seconds'
                % (downloadPath, timeout))
        time.sleep(pollInterval)
答案 8 :(得分:0)
此代码在无头模式下工作并返回下载的文件名(基于 @protonum 代码):
def wait_for_downloads(download_path, max_delay=30, interval_delay=0.5):
    """Wait for a Chrome download to finish and return the downloaded file's path.

    The directory is polled for ``.crdownload`` files. The download counts as
    finished once such a file has been seen and has since disappeared.

    Args:
        download_path: Directory Chrome downloads into.
        max_delay: Maximum number of seconds to wait.
        interval_delay: Seconds to sleep between directory scans.

    Returns:
        ``download_path + '/' +`` the name of the first ``.crdownload`` file
        seen, with ``".crdownload"`` removed. On timeout an error is logged
        and the same expression is still returned, so the result may point at
        an unfinished file, or be just ``download_path + '/'`` if no
        ``.crdownload`` file was ever observed (for example because the
        download completed before the first scan).
    """
    total_delay = 0
    pending_name = ''
    done = False
    while total_delay < max_delay:
        partial_files = [f for f in os.listdir(download_path)
                         if f.endswith('.crdownload')]
        if partial_files:
            pending_name = partial_files[0]
        elif pending_name:
            # The partial file we were tracking is gone: the download finished.
            done = True
            break
        time.sleep(interval_delay)
        total_delay += interval_delay
    if not done:
        logging.error("File(s) couldn't be downloaded")
    return download_path + '/' + pending_name.replace(".crdownload", "")
答案 9 :(得分:0)
def wait_for_download_to_be_don(self, path_to_folder, file_name,
                                max_time=60, poll_interval=0.5):
    """Wait until ``path_to_folder + file_name`` exists on disk.

    Args:
        path_to_folder: Folder prefix; it is concatenated directly with
            ``file_name``, so it must end with a path separator.
        file_name: Name of the file expected to appear.
        max_time: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between existence checks.

    Raises:
        AssertionError: If the file still does not exist after ``max_time``.
    """
    target = path_to_folder + file_name
    time_counter = 0
    while not os.path.exists(target) and time_counter < max_time:
        sleep(poll_interval)
        time_counter += poll_interval
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if not os.path.exists(target):
        raise AssertionError("The file wasn't downloaded")
答案 10 :(得分:-1)
在做测试自动化时,开发人员让软件具备可测试性至关重要。您的工作是同时检查软件及其可测试性,这意味着您需要要求开发人员提供一个加载指示器(spinner)或一个简单的 HTML 标签,用来表明下载何时成功完成。
在您的情况下,既无法在用户界面中检查下载状态,也无法在文件系统中检查,因此这是解决该问题的最佳方法。