So when I print something right after session.get() in the script below, it sometimes prints on the same line (the expected result), but most of the time it prints on a new line.

My understanding of asyncio/aiohttp is that execution is linear: it may suspend at one point and resume somewhere else, but it should not inject random newlines. I've noticed that with www.example.com it works as expected, but with walmart.com, for example, it does not. Walmart does take longer to load, but I don't think that should matter.
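To sanity-check that understanding, here is a stripped-down, aiohttp-free sketch (worker/main and the sleep standing in for the request are my own scaffolding, not part of the script below): several tasks print with end='' right after an await, and everything lands on one line, exactly as I'd expect:

import asyncio

async def worker(i):
    # The await yields control to the event loop, just like session.get() would.
    await asyncio.sleep(0.01 * i)
    print(f'[{i}]', end='')

async def main():
    await asyncio.gather(*(worker(i) for i in range(5)))

asyncio.run(main())  # prints [0][1][2][3][4] on a single line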
This should be a working example; just fill in a few URLs.

Also, if anyone can suggest a better way to do the same thing, I'd appreciate it, but that's not the focus of this question.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Webcrawler using Queues and asyncio."""
import asyncio
import aiohttp

from collections.abc import Sequence  # collections.Sequence was removed in Python 3.10
from timeit import default_timer
class NamedMutableSequence(Sequence):
    """Mutable namedtuple-style base class driven by __slots__."""

    __slots__ = ()

    def __init__(self, *a, **kw):
        slots = self.__slots__
        for k in slots:
            setattr(self, k, kw.get(k))

        if a:
            for k, v in zip(slots, a):
                setattr(self, k, v)

    def __str__(self):
        clsname = self.__class__.__name__
        values = ', '.join('%s=%r' % (k, getattr(self, k))
                           for k in self.__slots__)
        return f'{clsname}({values})'

    def __getitem__(self, item):
        return getattr(self, self.__slots__[item])

    def __setitem__(self, item, value):
        return setattr(self, self.__slots__[item], value)

    def __len__(self):
        return len(self.__slots__)

    __repr__ = __str__


class Item(NamedMutableSequence):
    __slots__ = ('url', 'retries')
class Crawler:
    def __init__(self, consumers=5, max_conn=15):
        """Initialize the Crawler class."""
        self.callback = None
        self.queue = None
        self.dlq = None  # dead-letter queue for items that need a retry
        self.sem = None
        self.responses = None
        self.consumers = consumers
        self.max_conn = max_conn
        self.loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self.loop)

    def crawl(self, urls, callback=None):
        start = default_timer()
        self.queue = asyncio.Queue(maxsize=3)
        self.dlq = asyncio.Queue()
        self.responses = []
        self.callback = callback
        self.loop.run_until_complete(self.run(urls))
        self.loop.close()
        end = default_timer()
        print(f'Operation took {end - start:.2f}s to complete.')
    async def run(self, urls):
        async with aiohttp.ClientSession(
                loop=self.loop,
                connector=aiohttp.TCPConnector(limit=self.max_conn)) as session:
            consumers = [asyncio.ensure_future(
                             self.consume(self.queue, self.dlq, session,
                                          name='consumers - 0%d' % i))
                         for i in range(self.consumers)]
            dlq_consumers = [asyncio.ensure_future(
                                 self.consume(self.dlq, self.dlq, session,
                                              name='DLQ - 0%d' % i))
                             for i in range(3)]

            await self.produce(urls)
            await self.queue.join()
            await self.dlq.join()

            # Everything has been processed; tear down the idle consumers.
            for future in consumers + dlq_consumers:
                future.cancel()
    async def produce(self, urls):
        for url in urls:
            item = Item(url=url, retries=0)
            await self.queue.put(item)
    async def consume(self, main, dlq, session=None, name=None):
        while True:
            try:
                item = await main.get()
                if item.retries > 3:
                    print(f'{item.url} retried more than 3 times.')
                    self.responses.append((False, {}))
                    main.task_done()
                else:
                    async with session.get(item.url) as response:
                        # end='' should suppress the newline, yet a newline
                        # shows up most of the time anyway.
                        print('This should stay on the same line', end='')

                        if response.status != 200:
                            item.retries += 1
                            main.task_done()
                            await asyncio.sleep(1)
                            await dlq.put(item)  # put() is a coroutine, so it must be awaited
                        else:
                            res = await self.callback(item, response)
                            main.task_done()
                            self.responses.append((True, res))
            except asyncio.TimeoutError:
                print('Timeout')
                item.retries += 1
                main.task_done()
                await asyncio.sleep(1)
                await dlq.put(item)
urls = []  # fill in a few URLs here


async def callback(item, response):
    return await response.text()  # text() is a coroutine and must be awaited


crawler = Crawler().crawl(urls, callback)
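For reference, this is roughly how I fill in the list when testing (the URLs are just placeholders; www.example.com is the site that behaves for me, walmart.com the one that does not):

urls = ['http://www.example.com'] * 10  # swap in e.g. https://www.walmart.com to reproduce
Crawler().crawl(urls, callback)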