我正在尝试使用 Python (3.5) 的 newspaper 包从 CNN 下载文章。程序如下:
from newspaper import Article
from newspaper import Config, Article, Source
import requests
# Address to fetch. According to the surrounding description, the host
# name in this URL cannot be resolved by DNS.
URL = 'http://rss.cnn1sdfs.com/c/35494/f/676993/s/4ddb6116/sc/14/l/0Ledition0Bcnn0N0C20A160C0A20C250Chealth0Ce0Ecigarette0Eexplodes0Ein0Emans0Epocket0Cindex0Bhtml0Deref0Fedition/story01.htm'

# newspaper settings: custom user agent, and the http_success_only flag
# switched on.
settings = Config()
settings.browser_user_agent = 'test/1.1'
settings.http_success_only = True

# Fetch the article through newspaper.
article = Article(config=settings, url=URL)
article.download()

# Direct `requests` variant, left disabled in this version of the script:
# response = requests.get(URL)
# response.raise_for_status()
即使DNS无法解析,此程序也不会失败!
我查看了 newspaper 库的源代码,相关函数如下:
def get_html(url, config=None, response=None):
    """Retrieve the HTML for either a URL or an existing response object.

    All HTML extraction MUST go through this function due to some
    intricacies in the requests module: to determine the encoding, requests
    only uses the HTTP header encoding declaration
    (``requests.utils.get_encoding_from_headers()``) and falls back to
    ISO-8859-1 if it doesn't find one, which results in an incorrect
    character encoding in a lot of cases. Whenever that fallback encoding is
    reported, the undecoded ``response.content`` is returned instead of
    ``response.text``.

    Args:
        url: Address to fetch; only used when ``response`` is None.
        config: Object supplying ``browser_user_agent``, ``request_timeout``
            and ``http_success_only``. A default ``Configuration()`` is
            created when it is omitted (or falsy).
        response: An already-fetched response object. When given, no new
            request is made and ``http_success_only`` is not consulted.

    Returns:
        ``response.text``, or ``response.content`` when the reported encoding
        is the ISO-8859-1 fallback. An empty string is returned when the
        request fails.

    Note:
        This function never propagates a request failure. Every
        ``requests.exceptions.RequestException`` raised while fetching is
        caught, logged at debug level and converted into ``''``. That
        includes the ``HTTPError`` raised by ``raise_for_status()`` when
        ``config.http_success_only`` is set, as well as connection errors
        such as an unresolvable host name. Callers that need to detect a
        failed download must check for the empty result.
    """
    FAIL_ENCODING = 'ISO-8859-1'
    config = config or Configuration()
    useragent = config.browser_user_agent
    timeout = config.request_timeout

    # A response supplied by the caller is used as-is; nothing is fetched.
    if response is not None:
        if response.encoding != FAIL_ENCODING:
            return response.text
        return response.content

    try:
        response = requests.get(
            url=url, **get_request_kwargs(timeout, useragent))
        if response.encoding != FAIL_ENCODING:
            html = response.text
        else:
            html = response.content
        if config.http_success_only:
            # Raises HTTPError for a non-"ok" response; because HTTPError is
            # a RequestException, it is handled by the except clause below.
            response.raise_for_status()
        if html is None:
            html = ''
        return html
    except requests.exceptions.RequestException as e:
        # Deliberate best effort: record the failure and hand back an empty
        # document instead of raising.
        log.debug('%s on %s', e, url)
        return ''
基于上面的代码,对response.raise_for_status()的调用应该导致一个未处理的异常,因此程序应该失败。
为了证明这一点,我也尝试直接使用 requests 库:
from newspaper import Article
from newspaper import Config, Article, Source
import requests
# Same target address as before. According to the surrounding description,
# its host name cannot be resolved by DNS.
URL = 'http://rss.cnn1sdfs.com/c/35494/f/676993/s/4ddb6116/sc/14/l/0Ledition0Bcnn0N0C20A160C0A20C250Chealth0Ce0Ecigarette0Eexplodes0Ein0Emans0Epocket0Cindex0Bhtml0Deref0Fedition/story01.htm'

# The newspaper settings are still built, but nothing below uses them
# while the Article lines stay disabled.
settings = Config()
settings.browser_user_agent = 'test/1.1'
settings.http_success_only = True

# newspaper variant, disabled in this version of the script:
# a = Article(url=URL,config=c)
# a.download()

# Call requests directly; no exception handling wraps these two calls, so
# any error they raise reaches the top level.
reply = requests.get(URL)
reply.raise_for_status()
这个程序失败了!
你能帮我理解为什么我的第一个程序没有失败以及如何修复它?我错过了一些明显的东西吗?