我试图使用selenium下载文件,如: ftp://ftp.sec.gov/edgar/full-index/1993/QTR1/form.idx ftp://ftp.sec.gov/edgar/full-index/2004/QTR1/form.idx
这些文件的内容只是纯文本,但它们奇怪的扩展名让我非常头疼。浏览器总是调用某个插件来打开文件,而我不知道 ".idx" 对应的MIME类型是什么。
在网上搜索后,我认为一种简单的方法是设置firefox配置文件:
# Excerpt from the body of a function (see get_browser in the full listing):
# it configures a Firefox profile intended to save downloads to disk without
# prompting. `cachedir` is expected to be supplied by the enclosing function.
profile = webdriver.FirefoxProfile()
# NOTE(review): per Firefox preference docs, folderList = 2 selects the custom
# directory given in browser.download.dir -- confirm against the Firefox version used.
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.dir', cachedir)
# MIME types that should be saved straight to disk instead of showing a dialog.
profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/pdf, text/plain, application/vnd.idx, application/xml, application/octet-stream, text/html, application/vnd.oasis.opendocument.text-web, application/rtf, text/richtext, application/xhtml+xml')
# The same MIME types are excluded from full-page plugin handling.
profile.set_preference('plugin.disable_full_page_plugin_for_types', 'application/pdf, text/plain, application/vnd.idx, application/xml, application/octet-stream, text/html, application/vnd.oasis.opendocument.text-web, application/rtf, text/richtext, application/xhtml+xml')
profile.set_preference('browser.helperApps.alwaysAsk.force', False)
profile.set_preference('browser.download.manager.showWhenStarting', False)
# Turn off the built-in PDF viewer preference.
profile.set_preference('pdfjs.disabled', True)
return webdriver.Firefox(profile)
我把能想到的几乎所有MIME类型都填进了 "browser.helperApps.neverAsk.saveToDisk" 和 "plugin.disable_full_page_plugin_for_types" 这两个属性,但似乎都没有奏效。
有谁知道这里应该填写的正确MIME类型是什么?或者更一般地说,我们如何得知任意文件的MIME类型(请注意,有些文件的扩展名并不标准)?
我的完整代码如下:
from bs4 import BeautifulSoup
import time
import os
from selenium import webdriver
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
def get_browser(cachedir):
    """Return a Firefox WebDriver whose profile is set up for silent downloads.

    Args:
        cachedir: Directory written to the ``browser.download.dir`` preference.

    Returns:
        The ``webdriver.Firefox`` instance created from the configured profile.
    """
    # One MIME list feeds both the "never ask" and the "no full-page plugin"
    # preferences, so it is kept in a single place.
    mime_types = 'application/pdf, text/plain, application/vnd.idx, application/xml, application/octet-stream, text/html, application/vnd.oasis.opendocument.text-web, application/rtf, text/richtext, application/xhtml+xml, text/x-mail'

    # Preferences are applied in this exact order.
    preferences = [
        ('browser.download.folderList', 2),
        ('browser.download.dir', cachedir),
        ('browser.helperApps.neverAsk.saveToDisk', mime_types),
        ('plugin.disable_full_page_plugin_for_types', mime_types),
        ('browser.helperApps.alwaysAsk.force', False),
        ('browser.download.manager.showWhenStarting', False),
        ('pdfjs.disabled', True),
    ]

    firefox_profile = webdriver.FirefoxProfile()
    for pref_name, pref_value in preferences:
        firefox_profile.set_preference(pref_name, pref_value)
    return webdriver.Firefox(firefox_profile)
def write_content(page_source, file_path):
    """Extract the visible text of a page and save it as a UTF-8 text file.

    Args:
        page_source: HTML markup of the page (e.g. ``browser.page_source``).
        file_path: Destination path; the file is created or overwritten.

    The text of the ``<body>`` element is written. If the markup has no
    ``<body>``, the text of the whole document is written instead of raising
    ``IndexError``.
    """
    # Local import keeps this function self-contained; io.open accepts an
    # ``encoding`` argument on both Python 2 and Python 3.
    import io

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page_source, "html.parser")
    body = soup.body
    form_content = body.get_text() if body is not None else soup.get_text()

    print("getting {}".format(file_path))
    # Let the file object do the encoding: writing pre-encoded bytes to a
    # text-mode file raises TypeError on Python 3.
    with io.open(file_path, "w", encoding="utf-8") as f_out:
        f_out.write(form_content)
# Driver script: open every quarterly form.idx in Firefox and save the
# rendered page text as forms/<year>_<qtr>.txt.
cachedir = "/Users/voiceup/Desktop"
form_dir = "forms/"

# open() does not create missing directories, so make sure the output
# directory exists before the first file is written.
if not os.path.isdir(form_dir):
    os.makedirs(form_dir)

browser = get_browser(cachedir)
try:
    for year in range(1993, 2015):
        for qtr in range(1, 5):
            # Format the numbers directly instead of rebinding the loop
            # variables to strings.
            url = "ftp://ftp.sec.gov/edgar/full-index/{}/QTR{}/form.idx".format(year, qtr)
            browser.get(url)

            # An alert means the file came back broken: accept it and
            # refresh until no alert shows up within two seconds.
            while True:
                try:
                    WebDriverWait(browser, 2).until(EC.alert_is_present())
                    # switch_to.alert replaces the deprecated switch_to_alert().
                    alert = browser.switch_to.alert
                    alert.accept()
                    print("alert accepted")
                    browser.refresh()
                except TimeoutException:
                    break

            # manually download the file
            file_name = "{}_{}.txt".format(year, qtr)
            file_path = os.path.join(form_dir, file_name)
            write_content(browser.page_source, file_path)
            # Short pause between requests.
            time.sleep(2)
finally:
    # Always shut the browser down, even when a download fails midway.
    browser.quit()
感谢。
答案 0 :(得分:1)
Selenium 绝对不是适合这项工作的工具——它给这个问题增加了巨大的开销。
在这种情况下,ftplib 非常适合:
import os
import ftplib
# Download every quarterly form.idx over FTP into forms/<year>_<qtr>.txt.
form_dir = "forms/"

# open() does not create missing directories, so make sure the output
# directory exists before the first file is written.
if not os.path.isdir(form_dir):
    os.makedirs(form_dir)

ftp = ftplib.FTP('ftp.sec.gov', 'anonymous')
try:
    for year in range(1993, 2015):
        for qtr in range(1, 5):
            url = "edgar/full-index/{year}/QTR{qtr}/form.idx".format(year=year, qtr=qtr)
            filename = "{year}_{qtr}.txt".format(year=year, qtr=qtr)
            # Parenthesised single-argument print works on Python 2 and 3.
            print("Process URL: " + url)

            # Stream the remote file straight to disk in binary mode.
            with open(os.path.join(form_dir, filename), "wb") as out_file:
                ftp.retrbinary("RETR " + url, out_file.write)
finally:
    # Release the connection even if one of the transfers fails.
    ftp.close()
运行脚本时,您会看到 forms/ 目录中创建了相应的文件,控制台上会打印以下内容:
Process URL: edgar/full-index/1993/QTR1/form.idx
Process URL: edgar/full-index/1993/QTR2/form.idx
Process URL: edgar/full-index/1993/QTR3/form.idx
Process URL: edgar/full-index/1993/QTR4/form.idx
Process URL: edgar/full-index/1994/QTR1/form.idx
...