我希望可能有来自不同域的10万个URL。我编写了这段代码,其中包含all_urls
中的URL列表,并形成了N
线程以批量运行。目前,我正在使用threading
模块来并行发出这些请求。
import requests
import os
import threading
import time
all_urls = [] # a list of URLs to request, can have up to 100k
global success, fail
success = 0
fail = 0
def func(url_to_request):
global success, fail
try:
r = requests.get(url_to_request, timeout=5)
c = r.content
success = success +1
except:
fail = fail +1
return
batch_count = 1
N = 200 # number of threads
all_threads_urls = []
time_start = time.time()
for item in all_urls:
all_threads_urls.append(item)
if all_urls.index(item) == len(all_urls)-1 or len(all_threads_urls) == N:
# call it
all_threads = []
for link in all_threads_urls:
current_thread = threading.Thread(target=func, args=(link,))
all_threads.append(current_thread)
current_thread.start()
for thr in all_threads:
thr.join()
all_threads_urls = [] # for the next batch
time_end = time.time()
print "Request number", all_urls.index(item)+1, "Good:", success, "Bad:", fail, "Duration:", round(time_end - time_start,2 ), "seconds."
time_start = time_end
这样做的结果有点怪异,似乎脚本启动非常快,但随后却放慢了很多(见图)。打印的持续时间适用于每批。
有人可以解释这里的瓶颈吗?也许有更好的模块可以解决这个问题吗?