加快多线程下载

时间:2019-07-11 04:22:32

标签: python multithreading http urllib

我编写了一个Python脚本,该脚本将从网站上下载文件。为了加快速度,我使文件的下载成为多线程的。显然,这比串行下载要快,但是我遇到了一些我无法解释的效果。

  1. 下载的第一个x文件(似乎与创建的线程数量成比例)非常快-输出每秒显示40个文件,但之后速度大大降低。
  2. 在某种程度上(接近200个线程),我每秒可以下载文件的最大速度为10个文件。如果我将线程数增加到700,则仍然以每秒10个文件的速度最大。将线程数增加到非常大的数量(超过1000个)似乎会限制基于CPU速度的下载速度。

所以,我的问题是:

  1. 为什么我下载的第一个文件与其余文件相比下载得这么快,我可以保持原始速度吗?
  2. 为什么增加线程数对下载速度的提升回报如此递减?

这是我的脚本:

#!/usr/bin/python

import inspect
import math
import os
import random
from ast import literal_eval
from datetime import timedelta
from queue import Empty, Queue
from threading import Thread, activeCount
from time import time, sleep
from urllib.request import ProxyHandler, build_opener

proxies = Queue()
threads = Queue()
agents = []
total_files = 0
finished_files = 0
downloaded_files = 0
start_time = 0

class Config(object):
    DEBUG = False
    PROXIES_PATH = '/home/shane/bin/proxies.txt'
    AGENTS_PATH = '/home/shane/bin/user-agents.txt'
    DESTINATION_PATH = '/home/shane/images/%d.jpg'
    SOURCE_URL = 'https://example.org/%d.jpg'
    MAX_THREADS = 500
    TIMEOUT = 62
    RETRIES = 1
    RETRIES_TIME = 1

def get_files_per_second():
    return float(downloaded_files) / (time() - start_time)

def get_time_remaining():
    delta = timedelta(seconds=float(total_files - finished_files) / get_files_per_second())
    seconds = delta.total_seconds()
    days, remainder = divmod(seconds, 86400)
    hours, remainder = divmod(remainder, 3600)
    minutes, seconds = divmod(remainder, 60)
    days = str(int(days)).zfill(2)
    hours = str(int(hours)).zfill(2)
    minutes = str(int(minutes)).zfill(2)
    seconds = str(int(seconds)).zfill(2)
    return "%s:%s:%s:%s" % (days, hours, minutes, seconds)

def release_proxy(opener):
    if Config.DEBUG:
        print('Releasing proxy')
    for handler in opener.handlers:
        if type(handler) is ProxyHandler:
            proxies.put(handler)
            return
    raise Exception('No proxy found')

def get_new_proxy():
    if Config.DEBUG:
        print('Getting new proxy')
    if proxies.empty():
        raise Exception('No proxies')
    return proxies.get()

def get_new_agent():
    if len(agents) == 0:
        raise Exception('No user agents')
    return random.choice(agents)

def get_new_opener():
    opener = build_opener(get_new_proxy())
    opener.addheaders = [('User-Agent', get_new_agent())]
    return opener

def download(opener, source, destination, tries=0):
    global finished_files, downloaded_files
    if Config.DEBUG:
        print('Downloading %s to %s' % (source, destination))
    try:
        result = opener.open(source, timeout=Config.TIMEOUT).read()
        with open(destination, 'wb') as d:
            d.write(result)
        release_proxy(opener)
        finished_files += 1
        downloaded_files += 1
        to_print = '(%d/%d files) (%d proxies) (%f files/second, %s left) (%d threads) %s'
        print(to_print % (finished_files, total_files, proxies.qsize(), round(get_files_per_second(), 2), get_time_remaining(), activeCount(), source))
    except Exception as e:
        if Config.DEBUG:
            print(e)
        if tries < Config.RETRIES:
            sleep(Config.RETRIES_TIME)
            download(opener, source, destination, tries + 1)
        else:
            if proxies.qsize() < Config.MAX_THREADS * 2:
                release_proxy(opener)
            download(get_new_opener(), source, destination, 0)

class Downloader(Thread):
    def __init__(self, source, destination):
        Thread.__init__(self)
        self.source = source
        self.destination = destination
    def run(self):
        if Config.DEBUG:
            print('Running thread')
        download(get_new_opener(), self.source, self.destination)
        if threads.qsize() > 0:
            threads.get().start()

def populate_proxies():
    if Config.DEBUG:
        print('Populating proxies')
    with open(Config.PROXIES_PATH, 'r') as fh:
        for line in fh:
            line = line.replace('\n', '')
            if Config.DEBUG:
                print('Adding %s to proxies' % line)
            proxies.put(ProxyHandler(literal_eval(line)))

def populate_agents():
    if Config.DEBUG:
        print('Populating agents')
    with open(Config.AGENTS_PATH, 'r') as fh:
        for line in fh:
            line = line.replace('\n', '')
            if Config.DEBUG:
                print('Adding %s to agents' % line)
            agents.append(line)

def populate_threads():
    global total_files, finished_files
    if Config.DEBUG:
        print('Populating threads')
    for x in range(0, 100000):
        destination = Config.SOURCE_URL % x
        # queue threads
        print('Queueing %s' % destination)
        threads.put(Downloader(source, destination))

def start_work():
    global start_time
    if threads.qsize() == 0:
        raise Exception('No work to be done')
    start_time = time()
    for x in range(0, min(threads.qsize(), Config.MAX_THREADS)):
        if Config.DEBUG:
            print('Starting thread %d' % x)
        threads.get().start()

populate_proxies()
populate_agents()
populate_threads()
start_work()

1 个答案:

答案 0 :(得分:1)

不能。您使用的线程数量非常高,而 Python 实际上并不会真正并行地运行线程,它只是在各线程之间频繁切换,看起来像是并行执行。如果任务是 CPU 密集型,请使用多进程(multiprocessing);如果任务是 I/O 密集型,线程才会有帮助。在普通的四核、8GB 内存的 PC 上,请把线程数保持在较低水平(10-70),否则线程切换的开销会拖慢代码。请查看以下两个链接:

  1. Stack Over Flow Question
  2. Executive Summary On this page.