Downloading and handling errors

Date: 2018-10-13 21:31:14

Tags: python web-scraping download

I have been using these functions from Ryan Mitchell's O'Reilly book "Web Scraping with Python":

import sys
import os.path
import socket
import random
import urllib2
import contextlib
import diskCache
import logging as logger
from bs4 import BeautifulSoup

DEFAULT_AGENT = 'Mozilla/5.0 Firefox/56.0'
DEFAULT_DELAY = 3
DEFAULT_RETRIES = 10
DEFAULT_TIMEOUT = 60
socket.setdefaulttimeout(DEFAULT_TIMEOUT)

def download(url, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None,
        cache=None, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, data=None):
    result = None
    if cache:
        try:
            result = cache[url]
        except KeyError:
            # url is not available in cache
            pass
        if (result is not None and result['code'] is not None
                and num_retries > 0 and 500 <= result['code'] < 600):
            # server error so ignore result from cache and re-download
            result = None
    if result is None:
        proxy = random.choice(proxies) if proxies else None
        headers = {'User-agent': user_agent}
        result = call(url, headers, proxy=proxy, num_retries=num_retries, cache=cache)
        if cache:
            # save result to cache
            cache[url] = result

    return result['html']

def call(url, headers, proxy, num_retries, cache=None, data=None):
    request = urllib2.Request(url, data, headers or {})
    try:
        logger.info('Downloading: %s', url)
        # urlopen itself can raise, so it must sit inside the try block
        with contextlib.closing(urllib2.urlopen(request)) as connection:
            html = connection.read()
            code = connection.getcode()
    except Exception as e:
        logger.exception('Download error: %s', str(e))
        if cache:
            # drop the stale cache entry for this url
            # (the original deleted the literal key 'url')
            try:
                del cache[url]
            except KeyError:
                pass
        html = None
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry server errors; pass by keyword so the arguments
                # land on the right parameters of download()
                return download(url, num_retries=num_retries - 1, data=data)
        else:
            code = None
    return {'html': html, 'code': code}
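
For context, here is a minimal way I call download() (without a cache, so every request goes to the network; the URL is just a placeholder):

html = download('http://example.com', num_retries=3)
if html is not None:
    print(len(html))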

I would like to know whether there is a simpler way to handle errors when downloading a URL. I have seen that the requests library is higher level and easier to work with, and it might simplify this. At the very least, how would this code look in Python 3?

Would it be something like this?

"""Functions used by the fetch module"""

# Standard library imports
import time
import socket
import logging as logger
from typing import Dict, Optional

# Third party imports
import requests
from requests.exceptions import HTTPError, Timeout
from bs4 import BeautifulSoup

# Constants
DEFAULT_AGENT = 'Mozilla/5.0 Firefox/56.0'
DEFAULT_DELAY = 3
DEFAULT_RETRIES = 10
DEFAULT_TIMEOUT = 60
socket.setdefaulttimeout(DEFAULT_TIMEOUT)

def fetch(url: str, retries: Optional[int] = DEFAULT_RETRIES) -> Dict:
    """Download an url"""
    code = None
    try:
        logger.info('Downloading: %s', url)
        resp = requests.get(url)
        resp.raise_for_status()
        code = resp.status_code
    except (HTTPError, Timeout) as ex:
        logger.exception("Couldn't download %s", ex)
        return None
    if code is not None and retries > 0 and \
            500 <= code < 600: # Server error
        logger.info('Retrying download')
        time.sleep(DEFAULT_DELAY)
        return fetch(url, retries-1)

    return {'html': resp, 'code': code}
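
A hypothetical usage of this fetch() sketch (and the reason bs4 is imported) would be:

result = fetch('https://example.com')
if result is not None:
    soup = BeautifulSoup(result['html'], 'html.parser')
    print(soup.title)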

1 Answer:

Answer 0 (score: 0)

As you said, with requests this becomes much simpler:

headers = {'User-Agent': 'Mozilla/5.0 Firefox/56.0'}
resp = requests.get(url, headers=headers, timeout=60)
print(resp.status_code)
print(resp.text)
# for an API use resp.json()

By default, this does not raise an exception for HTTP error status codes. If you do want one raised, you can call resp.raise_for_status().
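
For example, a minimal sketch reusing the resp from above:

try:
    resp.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
except requests.exceptions.HTTPError as err:
    print('HTTP error:', err)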

For more details, see http://docs.python-requests.org/en/master/user/quickstart/
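
If you also want the retry-on-server-error behaviour from your original code, one option (my suggestion, not something the quickstart covers) is to let urllib3's Retry handle it through an HTTPAdapter; here total and backoff_factor mirror your DEFAULT_RETRIES and DEFAULT_DELAY:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# retry up to 10 times on the listed server errors, backing off between attempts
retry = Retry(total=10, backoff_factor=3,
              status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

resp = session.get('http://example.com', timeout=60)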