使用 Python requests 库下载文件

时间:2016-02-23 09:01:38

标签: python multithreading python-2.7 python-requests

我写了一个Python脚本来使用多个(源)IP地址下载文件 - 请提出任何改进建议。

import cgi
import os
import posixpath
import Queue
import threading
import urllib
import urlparse
import random
import re
import shutil
import time

import requests
import requests_toolbelt

def get_IPs():
    """Returns all local IP addresses, parsed from Windows 'ipconfig' output."""
    # TODO: Windows only. Other options?
    # Keep lines such as 'IPv4 Address ... : 192.168.0.10' and take the
    # last whitespace-separated token (the address itself).
    return [line.strip().rsplit(' ', 1)[-1]
            for line in os.popen('ipconfig')
            if line.strip().startswith('IP')]

def get_info(url):
    """Returns (name, size) of the file pointed to by 'url'.

    The name is taken from the Content-Disposition header when present,
    otherwise from the last component of the URL path.  Raises KeyError
    if the server reports no Content-Length.
    """
    resp = requests.head(url, allow_redirects=True)
    # Narrow try body: only the header lookup may legitimately raise
    # KeyError here; the original also wrapped the HEAD request, which
    # could misread an unrelated KeyError and leave 'resp' unbound.
    try:
        # e.g. Content-Disposition: attachment; filename="foo.zip"
        name = cgi.parse_header(resp.headers['content-disposition'])[1]['filename']
    except KeyError:
        # No usable Content-Disposition header -- fall back to the URL path.
        name = posixpath.basename(urlparse.urlsplit(url).path)
    # Undo percent-encoding and '+'-as-space in the raw name.
    name = urllib.unquote_plus(name)
    size = int(resp.headers['content-length'])
    return name, size

def worker(url, session, ud, part, size):
    """Downloads one CHUNK_SIZE slice of the file into '<ud>/<part>'.

    Retries up to MAX_RETRIES times with the current session; when a
    session keeps failing it is returned to the pool and another one is
    taken, so a single bad source address cannot stall the part forever.
    The session in use is always put back on sessions_queue.
    """
    # TODO: optimal tries, timeout?
    # Range end is inclusive, so the last valid byte index is size - 1
    # (the original used min(size, ...), requesting one byte past the end).
    headers = {'range': 'bytes=%s-%s' % (
        part*CHUNK_SIZE, min(size - 1, (part + 1)*CHUNK_SIZE - 1))}
    while True:
        for _ in xrange(MAX_RETRIES):
            try:
                data = session.get(url, timeout=(2, 7), headers=headers).content
                with open('%s/%04d' % (ud, part), 'wb') as udw:
                    udw.write(data)
                sessions_queue.put(session)
                return
            except Exception:
                # Transient failure -- retry with the same session.
                # (Narrowed from a bare 'except:' which also swallowed
                # KeyboardInterrupt/SystemExit.)
                pass
        # This session exhausted its retries: recycle it and take another.
        # Iterative instead of the original recursion, so a long retry
        # cascade cannot blow the stack or hold several sessions at once.
        sessions_queue.put(session)
        session = sessions_queue.get()

def summary(name, size, elapsed):
    """Prints a short report once a download has completed."""
    speed = size/elapsed/2**20  # bytes/s -> MB/s
    report = '\n'.join([
        '--',
        '%s download completed.' % name,
        'Time elapsed: %.2fs' % elapsed,
        'Average download speed: %.2f MB/s' % speed,
        '--',
    ])
    print (report)

def download(url):
    """Downloads the file pointed to by 'url' in parallel chunks.

    Each chunk is fetched by its own worker thread into a temporary
    directory; the pieces are then concatenated in order into the final
    file and the temporary directory is removed.
    """
    # time.time() is wall-clock; the original time.clock() measures CPU
    # time on Unix, which is the wrong metric for a download.
    start = time.time()
    name, size = get_info(url)
    # random id of length 20 for the temporary chunk directory
    ud = '%0x' % random.getrandbits(80)
    os.mkdir(ud)
    threads = []
    # One thread per CHUNK_SIZE slice, rounding the count up.
    for i in xrange(size/CHUNK_SIZE + (size%CHUNK_SIZE != 0)):
        t = threading.Thread(target=worker, args=(url, sessions_queue.get(), ud, i, size))
        threads.append(t)
        t.start()

    # characters \/:*?"<>| not allowed in filenames in Windows
    name = re.sub(r'[\\/:*?"<>|]', '_', name)
    # TODO: check if a file is already present with same name
    # 'wb' (not the original 'ab') so a repeated download overwrites the
    # file instead of appending to it; 'with' ensures the handle and each
    # part file are closed (the original leaked all of them).
    with open(name, 'wb') as out:
        for i, t in enumerate(threads):
            t.join()
            with open('%s/%04d' % (ud, i), 'rb') as part:
                out.write(part.read())

    summary(name, size, time.time() - start)
    shutil.rmtree(ud)

def main():
    IPs = get_IPs()
    print len(IPs), 'IPs available.'
    for ip in IPs:
        adapter = requests_toolbelt.adapters.SourceAddressAdapter(ip)
        session = requests.Session()
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        sessions_queue.put(session)

    while True:
        threading.Thread(target=download, args=(raw_input(),)).start()


# Shared configuration.  Defined at module level (not under the __main__
# guard) so that worker() and download() can resolve these names even
# when this file is imported rather than run as a script; the original
# defined them inside the guard, breaking any import of this module.
sessions_queue = Queue.Queue()  # pool of per-source-IP requests.Session objects
KB = 1024
MB = 1024*KB
# TODO: optimal size?
CHUNK_SIZE = 100*KB  # bytes fetched per worker thread
MAX_RETRIES = 2      # attempts per session before switching sessions

if __name__ == '__main__':
    main()

我在以太网上使用大约 100 个 IP 地址——每个地址的速度大约为 100 KB/s。最佳配置是什么?(线程数、块大小)

0 个答案:

没有答案