python aiohttp请求很多URL被阻止的问题

时间:2019-06-01 14:27:36

标签: python aiohttp

我正在读取40MB(一百万行)的网址,并访问它们以在页面上找到字符。当我访问时,我发现该请求将始终被暂停或阻止,无法确定要等待什么?等待请求的响应? 该代码是在github上找到的并发应用程序,然后对其进行了修改。您能帮我看看问题出在哪里吗?谢谢〜

#!/usr/bin/python3
# encoding: utf-8
# time : 2019/6/1 20:31

import asyncio
from itertools import islice

import aiofiles as aiofiles
from aiohttp import ClientSession

try:
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
}


def iter_list():
    all_url = []
    with open("Domain_list.txt") as input_file:
        for line in islice(input_file, 0, None):
            all_url.append(line.strip())

        return all_url


async def save_txt(url):
    async with aiofiles.open('ok.txt', 'a+') as f:
        await f.write(url + '\n')


async def fetch(item, session):
    try:
        async with session.post(url=item, headers=headers, timeout=None) as req:
             """
                Find character '<<<EOT' And save
            """
            code = '<<<EOT'
            html = await req.text()
            if req.status in [200, 201]:
                if html.find(code) > 2:
                    await save_txt(item)
                    await print(item, html.find(code))
                else:
                    await print(f'{item}not!')
            else:
                await print(f'{item},{req.status()}')

    except:
        pass


async def bound_fetch(sem, url, session):
    # Getter function with semaphore.
    async with sem:
        await fetch(url, session)


async def run(lists):
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(500)

    # Create client session that will ensure we dont open new connection
    # per each request.
    async with ClientSession() as session:
        for i in lists:
            # pass Semaphore and session to every GET request
            task = asyncio.ensure_future(bound_fetch(sem, i, session))
            tasks.append(task)

        responses = asyncio.gather(*tasks)
        await responses


loop = asyncio.get_event_loop()
lists = iter_list()
future = asyncio.ensure_future(run(lists))
loop.run_until_complete(future)

0 个答案:

没有答案