多进程 + 多线程的 requests 爬虫:读取超时与计数不一致问题

时间:2019-09-13 08:14:58

标签: python python-3.x multithreading request multiprocessing

这是我第二次尝试编写并发 HTTP 爬虫。我想做的是:一个 HTTP 爬虫,每个 CPU 核心使用 1 个进程,每个进程使用 X 个线程。

完整代码:

import json
import multiprocessing as mp
import queue
import threading
from pprint import pprint

import requests

# --- Tunables -------------------------------------------------------------
MAX_THREADS = 100           # worker threads spawned by each process
REQUESTS_TIMEOUT = 20       # seconds before an HTTP request is abandoned
MAX_PROCS = mp.cpu_count()  # one worker process per CPU core
ENDPOINT = 'https://example.com'

seed_hashes = [
    'amwriting', 'lovewriting', 'poems', 'poemsporn'
]  # (Real version of script has a larger seed list)

# Shared work queue of tags still to be scraped.  mp.Queue IS safe to share
# with child processes, so every worker drains the same queue.
tag_queue = mp.Queue()
for seed in seed_hashes:
    tag_queue.put(seed)

# NOTE(review): MASTER aliases seed_hashes and is a plain Python list.
# Unlike tag_queue it is NOT shared between processes -- each mp.Process
# gets its own copy, so appends in one child are invisible to siblings.
# That is why len(MASTER) appears to rise and fall in the logs.  Use a
# manager/shared structure if a global de-dup set is intended.
MASTER = seed_hashes

TAG_COUNT = tag_queue.qsize()


# Serialises stdout access so lines printed from different threads
# do not interleave mid-line.
print_lock = threading.Lock()


def tprint(*args, **kwargs):
    """Thread-safe wrapper around the built-in print()."""
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        print_lock.release()


class Scraper:
    """Scrapes tag pages, extracts related tags, and feeds new ones back
    into the shared work queue.

    NOTE(review): MASTER and tag_queue are module-level globals.  Under
    multiprocessing, MASTER is per-process state (each child has its own
    copy) -- confirm whether a cross-process shared structure is intended.
    """

    # Bounded retry budget for scrape_page.  The original implementation
    # retried by recursing forever, which raises RecursionError under a
    # sustained run of timeouts.
    MAX_RETRIES = 5

    def __init__(self):
        self.file_lock = threading.Lock()    # serialises graph.json appends
        self.master_lock = threading.Lock()  # guards MASTER membership test+append
        # True when every character of s encodes to one byte (ASCII-only check).
        self.isascii = lambda s: len(s) == len(s.encode())

    def queue_tag(self, tag):
        """Record *tag* as seen and enqueue it for scraping."""
        MASTER.append(tag)
        tprint("Hashtag count: ", str(len(MASTER)))
        tag_queue.put(tag)
        tprint("Added new tag: ", tag)
        tprint(f"{tag_queue.qsize()} in queue.")

    def _maybe_queue(self, tag):
        """Queue *tag* if it is ASCII alphanumeric and not yet seen.

        Extracted helper: the original repeated this exact test/queue
        sequence three times (node ids, edge 'a', edge 'b').
        """
        with self.master_lock:
            if self.isascii(tag) and tag.isalnum() and tag not in MASTER:
                self.queue_tag(tag)

    def parse_tags(self, source):
        """Parse a scraped JSON payload, queue any new tags, and append the
        payload to graph.json.

        Tolerates a None/empty *source* (returned by scrape_page when its
        retry budget is exhausted).
        """
        if not source:
            return
        j = json.loads(source)

        if j['exists']:
            for node in j['nodes']:
                self._maybe_queue(node['id'])
            for edge in j['edges']:
                self._maybe_queue(edge['a'])
                self._maybe_queue(edge['b'])
            try:
                with self.file_lock:
                    with open("graph.json", "a") as fi:
                        # One JSON document per line (JSON-lines) so the
                        # output stays parseable record by record; the
                        # original concatenated documents with no separator.
                        fi.write(json.dumps(j) + "\n")
            except OSError as e:
                tprint("File write exception", e)

    def scrape_page(self, tag):
        """Fetch the page for *tag* and return the response body.

        Returns None after MAX_RETRIES failed attempts instead of recursing
        indefinitely.  Only network-level errors (requests.RequestException)
        trigger a retry; anything else propagates.
        """
        url = f"{ENDPOINT}{tag}"
        # Headers are loop-invariant, so build them once outside the retries.
        headers = requests.utils.default_headers()
        headers.update({
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36 OPR/62.0.3331.116',
            'authority': 'example.com',
            'origin': 'https://example.com',
            'referer': f'https://example.com?tag={tag}',
            'content-type': 'application/json'
        })
        for _ in range(self.MAX_RETRIES):
            try:
                res = requests.get(url, headers=headers, timeout=REQUESTS_TIMEOUT)
                if res and res.status_code == 200:
                    return res.text
                tprint("HTTP error for: ", tag)
            except requests.RequestException as e:
                tprint("Retrying after exception: ", e)
        return None

    def threader(self):
        """Thread body: take one tag off the queue and process it.

        A bounded get() avoids the original deadlock: with MAX_THREADS=100
        and only a handful of queued tags, surplus threads blocked forever
        on tag_queue.get().  Threads that find the queue empty simply exit.
        """
        try:
            tag = tag_queue.get(timeout=REQUESTS_TIMEOUT)
        except queue.Empty:
            return
        self.parse_tags(self.scrape_page(tag))

    def processor(self):
        """Process body: fan out MAX_THREADS worker threads and wait for
        all of them to finish."""
        threads = []
        for _ in range(MAX_THREADS):  # each Process creates a number of new Threads
            thread = threading.Thread(target=self.threader)
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()


if __name__ == "__main__":
    s = Scraper()
    # NOTE(review): each mp.Process receives its own copy of the Scraper
    # instance and of the module-level MASTER list (state is not shared
    # between processes).  Appends made in one child are invisible to the
    # others, which is why the printed "Hashtag count" rises and falls
    # instead of growing monotonically.  tag_queue, being an mp.Queue,
    # IS shared correctly across processes.
    # Re-spawns a full batch of MAX_PROCS processes every time the queue
    # is non-empty, and blocks until the whole batch finishes.
    while tag_queue.qsize():
        processes = []
        for _ in range(MAX_PROCS):
            p = mp.Process(target=s.processor)  # create a new Process
            p.start()
            processes.append(p)
        for process in processes:
            process.join()

我正在抓取很多数据,但同时我也得到了很多:

Retrying after exception:  HTTPSConnectionPool(host='example.com', port=443): Read timed out. (read timeout=20)

我也看到tprint("Hashtag count: ", str(len(MASTER)))行有问题。这个数字并没有像我预期的那样稳定增长。即使我从不删除列表中的任何项目,它也会增加和减少。

如何解决这两个问题?也欢迎其他改进。

0 个答案:

没有答案