我写了一个Python脚本,用多个(源)IP地址下载文件,请提出改进建议。
import cgi
import os
import posixpath
import Queue
import threading
import urllib
import urlparse
import random
import re
import shutil
import time
import requests
import requests_toolbelt
def get_IPs():
    """Return the addresses listed in the output of 'ipconfig'.

    Every output line that starts with 'IP' once surrounding whitespace is
    stripped contributes its last space-separated field to the result.

    Returns:
        A list of strings; empty when no such line is found.
    """
    # TODO: Windows only. Other options?
    # The 'with' block closes the pipe as soon as the output has been read,
    # instead of leaving that to garbage collection.
    with os.popen('ipconfig') as pipe:
        lines = [line.strip() for line in pipe]
    return [line.rsplit(' ', 1)[-1] for line in lines if line.startswith('IP')]
def get_info(url):
    """Look up the name and size of the file behind 'url'.

    A HEAD request (following redirects) is sent. The name is taken from the
    'filename' parameter of the Content-Disposition header; when the header
    or the parameter is missing, the last component of the URL path is used
    instead. Either way the name is URL-unquoted before it is returned.

    Returns:
        A (name, size) tuple, where size is the Content-Length as an int.

    Raises:
        KeyError: if the response has no Content-Length header.
    """
    resp = requests.head(url, allow_redirects=True)
    response_headers = resp.headers
    try:
        _, params = cgi.parse_header(response_headers['content-disposition'])
        raw_name = params['filename']
    except KeyError:
        # No usable header: fall back to the basename of the URL path.
        raw_name = posixpath.basename(urlparse.urlsplit(url).path)
    return urllib.unquote_plus(raw_name), int(response_headers['content-length'])
def worker(url, session, ud, part, size):
    """Download chunk number 'part' of 'url' into the directory 'ud'.

    The chunk is requested with an HTTP Range header and written to
    '<ud>/<part>' with the part number zero-padded to four digits. A session
    gets MAX_RETRIES attempts; once they are used up the session is put back
    on 'sessions_queue' and the next session taken from the queue is tried.
    The function only returns after the chunk has been written, and the
    session in use at that point is put back on the queue.

    Args:
        url: address of the file being downloaded.
        session: session object used for the first round of attempts.
        ud: directory that receives the part files.
        part: zero-based chunk index.
        size: total size of the file in bytes.
    """
    # TODO: optimal tries, timeout?
    first = part * CHUNK_SIZE
    # Range offsets are inclusive, so the last valid byte is size - 1.
    last = min(size - 1, (part + 1) * CHUNK_SIZE - 1)
    headers = {'range': 'bytes=%s-%s' % (first, last)}
    target = '%s/%04d' % (ud, part)
    while True:
        try:
            for _ in xrange(MAX_RETRIES):
                try:
                    data = session.get(url, timeout=(2, 7), headers=headers).content
                    with open(target, 'wb') as udw:
                        udw.write(data)
                    return
                except Exception:
                    # Best effort: a failed attempt only costs one retry.
                    # Unlike a bare 'except', this does not swallow
                    # KeyboardInterrupt or SystemExit.
                    pass
        finally:
            # Give the session back before waiting for another one; holding
            # it while blocked on the queue can deadlock when every session
            # is in use.
            sessions_queue.put(session)
        session = sessions_queue.get()
def summary(name, size, elapsed):
    """Print a short report for a finished download.

    Args:
        name: name of the downloaded file.
        size: number of bytes downloaded.
        elapsed: duration of the download in seconds.

    The average speed is reported in MB/s, with 1 MB taken as 2**20 bytes.
    """
    template = (
        '--\n'
        '%s download completed.\n'
        'Time elapsed: %.2fs\n'
        'Average download speed: %.2f MB/s\n'
        '--'
    )
    mb_per_second = size/elapsed/2**20
    print(template % (name, elapsed, mb_per_second))
def download(url):
    """Download the file at 'url' in CHUNK_SIZE pieces and reassemble it.

    One worker thread is started per piece, each with a session taken from
    'sessions_queue', so starting a thread blocks while no session is free.
    The pieces are written into a temporary directory, appended in order to
    the output file in the current directory, and the temporary directory is
    removed once the summary has been printed.
    """
    start = time.clock()
    name, size = get_info(url)
    # Random id of 80 bits, zero-padded so it is always 20 hex digits long.
    ud = '%020x' % random.getrandbits(80)
    os.mkdir(ud)
    # Ceiling division: a trailing partial chunk still needs its own part.
    part_count = (size + CHUNK_SIZE - 1) // CHUNK_SIZE
    threads = []
    for i in xrange(part_count):
        t = threading.Thread(target=worker, args=(url, sessions_queue.get(), ud, i, size))
        threads.append(t)
        t.start()
    # characters \/:*?"<>| not allowed in filenames in Windows
    name = re.sub(r'[\\/:*?"<>|]', '_', name)
    # TODO: check if a file is already present with same name
    with open(name, 'ab') as out:
        # Joining in index order lets early parts be copied while later
        # ones are still downloading.
        for i, t in enumerate(threads):
            t.join()
            with open('%s/%04d' % (ud, i), 'rb') as chunk:
                out.write(chunk.read())
    summary(name, size, time.clock() - start)
    shutil.rmtree(ud)
def main():
    """Create one session per local IP address, then download URLs from stdin.

    Each address returned by get_IPs() gets its own requests session whose
    HTTP and HTTPS traffic goes through a SourceAddressAdapter for that
    address. The sessions are placed on 'sessions_queue'. Afterwards every
    non-blank line read from standard input is treated as a URL and
    downloaded in its own thread, until end of input.
    """
    IPs = get_IPs()
    print('%d IPs available.' % len(IPs))
    for ip in IPs:
        adapter = requests_toolbelt.adapters.SourceAddressAdapter(ip)
        session = requests.Session()
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        sessions_queue.put(session)
    if not IPs:
        # With an empty queue every download would block forever waiting
        # for a session, so fall back to a single session that is not tied
        # to a particular source address.
        sessions_queue.put(requests.Session())
    while True:
        try:
            url = raw_input().strip()
        except EOFError:
            # End of input: stop reading; threads already started keep running.
            break
        if url:
            threading.Thread(target=download, args=(url,)).start()
# Shared configuration and state live at module level so that download() and
# worker() also work when this file is imported rather than run as a script.
KB = 1024
MB = 1024*KB
# TODO: optimal size?
CHUNK_SIZE = 100*KB
# Attempts a single session gets for one chunk before another session is tried.
MAX_RETRIES = 2
# Pool of sessions available to workers.
sessions_queue = Queue.Queue()

if __name__ == '__main__':
    main()
我在以太网上使用大约100个IP地址,每个地址的速度大约为100 KB/s。最佳配置(线程数、块大小)是什么?