This is my second attempt at a concurrent HTTP scraper. The goal: an HTTP scraper that uses one process per CPU core, and X threads per process.
Full code:
import threading
import multiprocessing as mp
import requests
from pprint import pprint
import json
MAX_THREADS = 100
REQUESTS_TIMEOUT = 20
MAX_PROCS = mp.cpu_count()
ENDPOINT = 'https://example.com'
seed_hashes = [
    'amwriting', 'lovewriting', 'poems', 'poemsporn'
]  # (the real version of the script has a larger seed list)
tag_queue = mp.Queue()
for seed in seed_hashes:
    tag_queue.put(seed)
MASTER = seed_hashes
TAG_COUNT = tag_queue.qsize()
print_lock = threading.Lock()
def tprint(*args, **kwargs):
    with print_lock:
        print(*args, **kwargs)
class Scraper:
    def __init__(self):
        self.file_lock = threading.Lock()
        self.master_lock = threading.Lock()
        self.isascii = lambda s: len(s) == len(s.encode())

    def queue_tag(self, tag):
        MASTER.append(tag)
        tprint("Hashtag count: ", str(len(MASTER)))
        tag_queue.put(tag)
        tprint("Added new tag: ", tag)
        tprint(f"{tag_queue.qsize()} in queue.")
    def parse_tags(self, source):
        j = json.loads(source)
        if j['exists']:
            for node in j['nodes']:
                tag = node['id']
                with self.master_lock:
                    if self.isascii(tag) and tag.isalnum() and tag not in MASTER:
                        self.queue_tag(tag)
            for edge in j['edges']:
                a = edge['a']
                with self.master_lock:
                    if self.isascii(a) and a.isalnum() and a not in MASTER:
                        self.queue_tag(a)
                b = edge['b']
                with self.master_lock:
                    if self.isascii(b) and b.isalnum() and b not in MASTER:
                        self.queue_tag(b)
        try:
            with self.file_lock:
                with open("graph.json", "a") as fi:
                    fi.write(json.dumps(j))
        except Exception as e:
            tprint("File write exception", e)

    def scrape_page(self, tag):
        url = f"{ENDPOINT}{tag}"
        try:
            headers = requests.utils.default_headers()
            headers.update({
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36 OPR/62.0.3331.116',
                'authority': 'example.com',
                'origin': 'https://example.com',
                'referer': f'https://example.com?tag={tag}',
                'content-type': 'application/json'
            })
            res = requests.get(url, headers=headers, timeout=REQUESTS_TIMEOUT)  # was 30
            if res and res.status_code == 200:
                return res.text
            else:
                tprint("HTTP error for: ", tag)
                return self.scrape_page(tag)
        except Exception as e:
            tprint("Retrying after exception: ", e)
            return self.scrape_page(tag)

    def threader(self):
        tag = tag_queue.get()
        result = self.scrape_page(tag)
        self.parse_tags(result)

    def processor(self):
        threads = []
        for _ in range(MAX_THREADS):  # each Process creates a number of new Threads
            thread = threading.Thread(target=self.threader)
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()
if __name__ == "__main__":
    s = Scraper()
    while tag_queue.qsize():
        processes = []
        for _ in range(MAX_PROCS):
            p = mp.Process(target=s.processor)  # create a new Process
            p.start()
            processes.append(p)
        for process in processes:
            process.join()
I am scraping plenty of data, but at the same time I am also getting a lot of:
Retrying after exception: HTTPSConnectionPool(host='example.com', port=443): Read timed out. (read timeout=20)
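Part of this noise is probably self-inflicted: scrape_page retries by calling itself with no upper bound, so one slow tag turns into an endless stream of stacked timeouts (and can even hit the recursion limit). This is the bounded-retry variant I have been sketching; the max_retries value and the backoff delays are my own guesses, not anything the endpoint requires:

import time
import requests

REQUESTS_TIMEOUT = 20

def fetch_with_retries(url, headers=None, max_retries=3):
    # Sketch: a bounded loop with exponential backoff instead of
    # unbounded recursion; returns None once the retries are used up.
    for attempt in range(max_retries):
        try:
            res = requests.get(url, headers=headers, timeout=REQUESTS_TIMEOUT)
            if res.status_code == 200:
                return res.text
            print("HTTP error", res.status_code, "for:", url)
        except requests.RequestException as e:
            print("Attempt", attempt + 1, "failed:", e)
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)  # wait 1s, then 2s, between attempts
    return None  # permanent failure: the caller has to handle this

threader would then have to skip parse_tags when the result is None. Is capping retries like this the right call, or should failed tags go back on the queue instead?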
I also see a problem with the tprint("Hashtag count: ", str(len(MASTER))) line: the number does not grow steadily the way I expect. It goes up and down, even though I never remove anything from the list.
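My current suspicion is that each mp.Process gets its own copy of MASTER when it starts, so the counts printed by different processes grow independently and interleave in the terminal. This tiny standalone test (my own repro, not part of the scraper) seems to back that up, since a plain list mutated in a child process never changes in the parent:

import multiprocessing as mp

items = ['seed']

def child():
    items.append('added-in-child')
    print("child sees:", len(items))   # 2

if __name__ == "__main__":
    p = mp.Process(target=child)
    p.start()
    p.join()
    print("parent sees:", len(items))  # still 1: the child changed its own copy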
How can I fix these two problems? Other improvements are welcome too.
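In case it is relevant: for the second problem I have been eyeing multiprocessing.Manager, so that every process would mutate the same list through a proxy instead of a private copy. A minimal sketch of what I mean (the names are mine; I have not wired this into the scraper yet):

import multiprocessing as mp

def worker(master, lock, tag):
    # All processes append to the same manager-backed list.
    with lock:
        if tag not in master:
            master.append(tag)

if __name__ == "__main__":
    manager = mp.Manager()
    master = manager.list(['amwriting', 'lovewriting'])  # shared proxy list
    lock = manager.Lock()  # a lock usable from every process
    procs = [mp.Process(target=worker, args=(master, lock, t))
             for t in ('poems', 'poems', 'poemsporn')]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(len(master))  # 4: the duplicate 'poems' was only added once

Would a Manager list (or something like it) be the right tool here, or is there a better pattern for deduplicating across processes?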