Question

我需要从前一百万个域名获取 HTTP GET 响应，并且想开启尽可能多的并发线程，以便更快地完成。我找到的唯一相关帖子是 What is the fastest way to send 100,000 HTTP requests in Python?，该帖子中的解决方案使用了 concurrent.futures，运行结果符合预期。

但是，问题在于，当我把 worker（工作线程）数量设置得更高时，性能提升似乎就停滞了——也就是说，无论把 worker 数量设置为 1000 还是 10,000，我都感觉不到任何差异。我在付费的 EC2 实例上运行它，可以看到只使用了一小部分可用的 CPU 和内存。不知道是怎么回事：可创建的并发线程数量是否有上限？我可以突破这个限制吗？

Answer 1

我发现 urllib3 和 requests 之间没有太大区别（requests 可能快一点）。我会使用异步库，因为这正是异步库的典型用例。

from gevent import monkey, spawn, joinall
monkey.patch_all()
import urllib3, certifi
from time import time

# Benchmark: send a fixed number of concurrent GET requests to one URL, using
# gevent greenlets that share a single urllib3 PoolManager, and report how
# many greenlets were joined and how long the whole run took.
num_requests = 10000
url = 'https://www.google.com'
upool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where(), num_pools=20, block=False)

t0 = time()
# One greenlet per request. `range` (instead of the Python 2-only `xrange`)
# keeps this runnable on both Python 2 and Python 3.
threads = [spawn(upool.request, 'GET', url) for _ in range(num_requests)]

# Wait for the spawned greenlets before reading the clock again.
x = joinall(threads)

# Call-form print is valid on Python 2 and 3; the statement form
# (`print len(x)`) is a SyntaxError on Python 3.
print(len(x))
print(time() - t0)

请注意，您可以通过将 block 设置为 True 来限制使用的连接数。

*多进程更新*

from gevent import monkey, spawn, joinall
monkey.patch_all()
import urllib3, certifi
from time import time
import gipc

# Number of greenlets (one GET request each) that every worker process starts.
num_threads = 1000
# Process handles keyed by the CPU index rendered as a string; populated by
# multicore().
worker = dict()

def fetch(num_threads, url, cpu):
    """Issue ``num_threads`` concurrent GET requests to ``url`` and time them.

    Args:
        num_threads: Number of greenlets to spawn; each performs one request.
        url: Address requested by every greenlet.
        cpu: Label for this worker; only used in the start-up message.

    Returns:
        A tuple ``(greenlets, elapsed)`` where ``greenlets`` is the value
        returned by ``joinall`` and ``elapsed`` is the number of seconds
        between just before the first spawn and the return of ``joinall``.
    """
    print('starting {}'.format(cpu))
    upool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where(), num_pools=20, block=False)
    t0 = time()
    # `range` replaces the Python 2-only `xrange` (a NameError on Python 3);
    # the loop index itself is never used.
    threads = [spawn(upool.request, 'GET', url) for _ in range(num_threads)]
    x = joinall(threads)
    return x, time() - t0

def count_cpus():
    """Print and return the CPU count reported by ``multiprocessing``."""
    from multiprocessing import cpu_count

    total = cpu_count()
    print(total)
    return total

def multicore(url):
    """Run ``fetch`` against ``url`` in one gipc child process per CPU.

    Each child is started with ``(num_threads, url, cpu_index)`` and its
    process handle is stored in the module-level ``worker`` dict under
    ``str(cpu_index)``. Every handle in ``worker`` is then joined.

    Args:
        url: Address passed on to every ``fetch`` call.

    Returns:
        The module-level ``worker`` dict of process handles.
    """
    global worker
    # NOTE(review): neither pipe end (r, w) is used or handed to the children,
    # so the values returned by fetch() are never collected here -- callers
    # only get the process handles. Confirm whether that is intended.
    with gipc.pipe() as (r, w):
        cpu_total = count_cpus()
        for index in range(cpu_total):
            handle = gipc.start_process(target=fetch, args=(num_threads, url, index))
            worker[str(index)] = handle
    # Wait for every child only after all of them have been started.
    for handle in worker.values():
        handle.join()
    return worker

if __name__ == '__main__':
    # Start one fetch process per CPU and wait for all of them to finish.
    multicore('https://www.google.com')

    # Show each process handle. Call-form print works on Python 2 and 3;
    # the original statement form is a SyntaxError on Python 3.
    for handle in worker.values():
        print(handle)

如何使用Python同时发送10,000个HTTP请求

1 个答案: