我编写了从佛罗里达州紧急管理部门网站下载数据的代码。几个月以来,代码一直工作良好。但是今天当我运行它时,我得到下面的错误。我什至只测试了直接链接到其中一个文件的wget,仍然遇到相同的错误。我已经仔细检查了我的用户代理。我修改了标头。如果我使用wget从站点下载文件而不运行基于硒的脚本,则可以单独下载它们。我猜该网站阻止了我,因为它将我识别为机器人,但我不确定如何。谁能解释为什么这种情况仍在发生,我该怎么做才能解决问题?
用户代理:Mozilla / 5.0(Windows NT 10.0; Win64; x64)AppleWebKit / 537.36(KHTML,例如Gecko)Chrome / 85.0.4183.83 Safari / 537.36
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--disable-plugins-discovery")
chrome_options.add_argument("--start-maximized")
driver = webdriver.Chrome(chrome_path,options=chrome_options)
print('Starting Data Download')
link_counter = 0
download_counter = 0
link_n = len(result_full) -152
download_list = []
for links in result_full:
if links.text.find('Data Report') > 0:
link_url = links.get_attribute('href')
filename = wget.filename_from_url(link_url)
if not os.path.exists(f'{pdf_output_path}/{filename}'):
wget.download(link_url, out = f'{pdf_output_path}')
download_counter += 1
download_list.append
print("Downloading", links.text)
link_counter +=1
print (f'{round((link_counter)*100/link_n,2)}% Complete')
print('Download of New Files Complete')
print(f'{download_counter} Files Created')
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-112-3aafa377b505> in <module>
9 filename = wget.filename_from_url(link_url)
10 if not os.path.exists(f'{pdf_output_path}/{filename}'):
---> 11 wget.download(link_url, out = f'{pdf_output_path}')
12 download_counter += 1
13 download_list.append
E:\Anaconda\lib\site-packages\wget.py in download(url, out, bar)
524 else:
525 binurl = url
--> 526 (tmpfile, headers) = ulib.urlretrieve(binurl, tmpfile, callback)
527 filename = detect_filename(url, out, headers)
528 if outdir:
E:\Anaconda\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
E:\Anaconda\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
E:\Anaconda\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
E:\Anaconda\lib\urllib\request.py in http_response(self, request, response)
639 if not (200 <= code < 300):
640 response = self.parent.error(
--> 641 'http', request, response, code, msg, hdrs)
642
643 return response
E:\Anaconda\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
E:\Anaconda\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
E:\Anaconda\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
答案 0 :(得分:0)
如果您收到错误403,则表示您没有权限,具体取决于状态码。您可以在这里查看:https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
您可能需要在网站上进行某种身份验证,您可能需要通过应用发布一些凭据,然后尝试下载。