我的目的是从https://www.shareinvestor.com/prices/price_download_zip_file.zip?type=history_all&market=bursa
下载zip文件
这是此网页https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa中的链接。然后将其保存到此目录"/home/vinvin/shKLSE/
(我正在使用pythonaywhere)。然后将其解压缩并将csv文件解压缩到目录中。
代码运行到最后没有错误,但没有下载。 当手动点击https://www.shareinvestor.com/prices/price_download_zip_file.zip?type=history_all&market=bursa时,会自动下载zip文件。
使用带有工作用户名和密码的代码。使用真实用户名和密码,以便更容易理解问题。
#!/usr/bin/python
print "hello from python 2"
import urllib2
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from pyvirtualdisplay import Display
import requests, zipfile, os
display = Display(visible=0, size=(800, 600))
display.start()
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', "/home/vinvin/shKLSE/")
profile.set_preference('browser.helperApps.neverAsk.saveToDisk', '/zip')
for retry in range(5):
try:
browser = webdriver.Firefox(profile)
print "firefox"
break
except:
time.sleep(3)
time.sleep(1)
browser.get("https://www.shareinvestor.com/my")
time.sleep(10)
login_main = browser.find_element_by_xpath("//*[@href='/user/login.html']").click()
print browser.current_url
username = browser.find_element_by_id("sic_login_header_username")
password = browser.find_element_by_id("sic_login_header_password")
print "find id done"
username.send_keys("bkcollection")
password.send_keys("123456")
print "log in done"
login_attempt = browser.find_element_by_xpath("//*[@type='submit']")
login_attempt.submit()
browser.get("https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa")
print browser.current_url
time.sleep(20)
dl = browser.find_element_by_xpath("//*[@href='/prices/price_download_zip_file.zip?type=history_all&market=bursa']").click()
time.sleep(30)
browser.close()
browser.quit()
display.stop()
zip_ref = zipfile.ZipFile(/home/vinvin/sh/KLSE, 'r')
zip_ref.extractall(/home/vinvin/sh/KLSE)
zip_ref.close()
os.remove(zip_ref)
HTML片段:
<li><a href="/prices/price_download_zip_file.zip?type=history_all&market=bursa">All Historical Data</a> <span>About 220 MB</span></li>
请注意,复制代码段时会显示&amp; amp。它隐藏在视图源之外,所以我猜它是用JavaScript编写的。
观察我找到了
答案 0 :(得分:3)
我重写了你的脚本,评论解释了为什么我做了我做的改变。我认为你的主要问题可能是一个糟糕的mimetype,但是,你的脚本有一个系统问题的日志,这会使它最多不可靠。此重写使用显式等待,这完全消除了使用time.sleep()
的需要,允许它尽可能快地运行,同时还消除了网络拥塞引起的错误。
您需要执行以下操作以确保安装所有模块:
pip install requests explicit selenium retry pyvirtualdisplay
剧本:
#!/usr/bin/python
from __future__ import print_function # Makes your code portable
import os
import glob
import zipfile
from contextlib import contextmanager
import requests
from retry import retry
from explicit import waiter, XPATH, ID
from selenium import webdriver
from pyvirtualdisplay import Display
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
DOWNLOAD_DIR = "/tmp/shKLSE/"
def build_profile():
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', DOWNLOAD_DIR)
# I think your `/zip` mime type was incorrect. This works for me
profile.set_preference('browser.helperApps.neverAsk.saveToDisk',
'application/vnd.ms-excel,application/zip')
return profile
# Retry is an elegant way to retry the browser creation
# Though you should narrow the scope to whatever the actual exception is you are
# retrying on
@retry(Exception, tries=5, delay=3)
@contextmanager # This turns get_browser into a context manager
def get_browser():
# Use a context manager with Display, so it will be closed even if an
# exception is thrown
profile = build_profile()
with Display(visible=0, size=(800, 600)):
browser = webdriver.Firefox(profile)
print("firefox")
try:
yield browser
finally:
# Let a try/finally block manage closing the browser, even if an
# exception is called
browser.quit()
def main():
print("hello from python 2")
with get_browser() as browser:
browser.get("https://www.shareinvestor.com/my")
# Click the login button
# waiter is a helper function that makes it easy to use explicit waits
# with it you dont need to use time.sleep() calls at all
login_xpath = '//*/div[@class="sic_logIn-bg"]/a'
waiter.find_element(browser, login_xpath, XPATH).click()
print(browser.current_url)
# Log in
username = "bkcollection"
username_id = "sic_login_header_username"
password = "123456"
password_id = "sic_login_header_password"
waiter.find_write(browser, username_id, username, by=ID)
waiter.find_write(browser, password_id, password, by=ID, send_enter=True)
# Wait for login process to finish by locating an element only found
# after logging in, like the Logged In Nav
nav_id = 'sic_loggedInNav'
waiter.find_element(browser, nav_id, ID)
print("log in done")
# Load the target page
target_url = ("https://www.shareinvestor.com/prices/price_download.html#/?"
"type=price_download_all_stocks_bursa")
browser.get(target_url)
print(browser.current_url)
# CLick download button
all_data_xpath = ("//*[@href='/prices/price_download_zip_file.zip?"
"type=history_all&market=bursa']")
waiter.find_element(browser, all_data_xpath, XPATH).click()
# This is a bit challenging: You need to wait until the download is complete
# This file is 220 MB, it takes a while to complete. This method waits until
# there is at least one file in the dir, then waits until there are no
# filenames that end in `.part`
# Note that is is problematic if there is already a file in the target dir. I
# suggest looking into using the tempdir module to create a unique, temporary
# directory for downloading every time you run your script
print("Waiting for download to complete")
at_least_1 = lambda x: len(x("{0}/*.zip*".format(DOWNLOAD_DIR))) > 0
WebDriverWait(glob.glob, 300).until(at_least_1)
no_parts = lambda x: len(x("{0}/*.part".format(DOWNLOAD_DIR))) == 0
WebDriverWait(glob.glob, 300).until(no_parts)
print("Download Done")
# Now do whatever it is you need to do with the zip file
# zip_ref = zipfile.ZipFile(DOWNLOAD_DIR, 'r')
# zip_ref.extractall(DOWNLOAD_DIR)
# zip_ref.close()
# os.remove(zip_ref)
print("Done!")
if __name__ == "__main__":
main()
完全披露:我维护显式模块。它旨在使显式等待变得更容易,对于这样的情况,网站根据用户交互缓慢加载动态内容。您可以使用直接显式等待替换上面的所有waiter.XXX
调用。
答案 1 :(得分:3)
我认为你的代码块没有任何重大缺陷。但是这里有一些建议通过这个解决方案&amp;执行此自动测试脚本:
JavaScript
&amp; Ajax Calls
正在发挥作用,处理这些问题超出了本课题的范围。wait
HTML DOM正常呈现。FirefoxProfile
中设置更多偏好设置,如下面的代码所示。browser.maximize_window()
browser.quit()
,则无需使用browser.close()
time.sleep()
替换为ImplicitlyWait
或ExplicitWait
或FluentWait
。这是您自己的代码块,其中包含一些简单的调整:
#!/usr/bin/python
print "hello from python 2"
import urllib2
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from pyvirtualdisplay import Display
import requests, zipfile, os
display = Display(visible=0, size=(800, 600))
display.start()
newpath = 'C:\\home\\vivvin\\shKLSE'
if not os.path.exists(newpath):
os.makedirs(newpath)
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.dir",newpath);
profile.set_preference("browser.download.folderList",2);
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/zip");
profile.set_preference("browser.download.manager.showWhenStarting",False);
profile.set_preference("browser.helperApps.neverAsk.openFile","application/zip");
profile.set_preference("browser.helperApps.alwaysAsk.force", False);
profile.set_preference("browser.download.manager.useWindow", False);
profile.set_preference("browser.download.manager.focusWhenStarting", False);
profile.set_preference("browser.helperApps.neverAsk.openFile", "");
profile.set_preference("browser.download.manager.alertOnEXEOpen", False);
profile.set_preference("browser.download.manager.showAlertOnComplete", False);
profile.set_preference("browser.download.manager.closeWhenDone", True);
profile.set_preference("pdfjs.disabled", True);
for retry in range(5):
try:
browser = webdriver.Firefox(profile)
print "firefox"
break
except:
time.sleep(3)
time.sleep(1)
browser.maximize_window()
browser.get("https://www.shareinvestor.com/my")
time.sleep(10)
login_main = browser.find_element_by_xpath("//*[@href='/user/login.html']").click()
time.sleep(10)
print browser.current_url
username = browser.find_element_by_id("sic_login_header_username")
password = browser.find_element_by_id("sic_login_header_password")
print "find id done"
username.send_keys("bkcollection")
password.send_keys("123456")
print "log in done"
login_attempt = browser.find_element_by_xpath("//*[@type='submit']")
login_attempt.submit()
browser.get("https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa")
print browser.current_url
time.sleep(20)
dl = browser.find_element_by_xpath("//*[@href='/prices/price_download_zip_file.zip?type=history_all&market=bursa']").click()
time.sleep(900)
browser.close()
browser.quit()
display.stop()
zip_ref = zipfile.ZipFile(/home/vinvin/sh/KLSE, 'r')
zip_ref.extractall(/home/vinvin/sh/KLSE)
zip_ref.close()
os.remove(zip_ref)
如果这回答你的问题,请告诉我。
答案 2 :(得分:1)
原因是由于网页加载缓慢。我在打开网页链接后添加了等待20秒
login_attempt.submit()
browser.get("https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa")
print browser.current_url
time.sleep(20)
dl = browser.find_element_by_xpath("//*[@href='/prices/price_download_zip_file.zip?type=history_all&market=bursa']").click()
它不会返回任何错误。
另外,
/zip
的MIME类型不正确。更改为profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/zip')
最终更正:
#!/usr/bin/python
print "hello from python 2"
import urllib2
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from pyvirtualdisplay import Display
import requests, zipfile, os
display = Display(visible=0, size=(800, 600))
display.start()
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', "/home/vinvin/shKLSE/")
# application/zip not /zip
profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/zip')
for retry in range(5):
try:
browser = webdriver.Firefox(profile)
print "firefox"
break
except:
time.sleep(3)
time.sleep(1)
browser.get("https://www.shareinvestor.com/my")
time.sleep(10)
login_main = browser.find_element_by_xpath("//*[@href='/user/login.html']").click()
print browser.current_url
username = browser.find_element_by_id("sic_login_header_username")
password = browser.find_element_by_id("sic_login_header_password")
print "find id done"
username.send_keys("bkcollection")
password.send_keys("123456")
print "log in done"
login_attempt = browser.find_element_by_xpath("//*[@type='submit']")
login_attempt.submit()
browser.get("https://www.shareinvestor.com/prices/price_download.html#/?type=price_download_all_stocks_bursa")
print browser.current_url
time.sleep(20)
dl = browser.find_element_by_xpath("//*[@href='/prices/price_download_zip_file.zip?type=history_all&market=bursa']").click()
time.sleep(30)
browser.close()
browser.quit()
display.stop()
zip_ref = zipfile.ZipFile('/home/vinvin/shKLSE/file.zip', 'r')
zip_ref.extractall('/home/vinvin/shKLSE')
zip_ref.close()
# remove with correct path
os.remove('/home/vinvin/shKLSE/file.zip')
答案 3 :(得分:1)
将其从硒的范围中取出。更改首选项设置,以便在单击链接时(首先检查链接是否有效),它会弹出一个要求保存的内容,现在使用sikuli http://www.sikuli.org/单击弹出窗口。 Mime类型并不总是有效,并且没有黑白回答为什么它不起作用。
答案 4 :(得分:1)
我还没有在你提到的网站上试过,但是下面的代码运行完美并下载了ZIP。如果您无法下载zip,Mime类型可能会有所不同。您可以使用Chrome浏览器和网络检查来检查您要下载的文件的 mime类型。
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', "/home/vinvin/shKLSE/")
profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/zip')
browser = webdriver.Firefox(profile)
browser.get("http://www.colorado.edu/conflict/peace/download/peace.zip")