有一个基于 asyncio + aiohttp + lxml 的解析器。
import asyncio
import aiohttp
import os
import time
import logging
import lxml.html
from concurrent.futures import ThreadPoolExecutor
from contextlib import suppress
from mysql.connector import MySQLConnection, Error
log = logging.getLogger(__name__)
# Logging is configured exactly once: logging.basicConfig() is a no-op when
# handlers already exist, so the previous second DEBUG call had no effect.
# Renamed from `format` to avoid shadowing the builtin.
LOG_FORMAT = '%(asctime)s %(levelname)s:%(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
# Caps the number of concurrently running HTTP requests.
# NOTE(review): created at import time, before the loop runs; this works on
# Python 3.7 but implicitly binds a loop -- on newer Python, create it inside
# start_main instead.
limit = asyncio.Semaphore(100)
# Browser-like request headers sent with every fetch.
headers = {'Accept': 'text/html,application/xhtml+xml,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'}
# Seed URLs for the crawl and the site root used to absolutize relative hrefs.
url_main_list = ["https://technical.city/en/cpu/history", "https://technical.city/en/video/history"]
main_type = "https://technical.city"
def to_string(node):
    """Serialize an lxml HTML *node* back into a unicode markup string."""
    markup = lxml.html.tostring(node, encoding='unicode')
    return markup
async def request(client, url):
    """Fetch *url* with *client* and return the response body as text.

    Retries forever on request errors, but:
    * re-raises ``asyncio.CancelledError`` -- the old bare ``except:``
      swallowed it, so cancelled tasks retried forever and the loop logged
      "Task was destroyed but it is pending!" at shutdown;
    * sleeps with ``await asyncio.sleep`` -- the old ``time.sleep(1)``
      blocked the whole event loop (the "poll ... took ... ms" warnings).

    Concurrency is capped by the module-level ``limit`` semaphore.
    """
    while True:
        async with limit:
            try:
                async with client.get(url, headers=headers) as r:
                    log.info('Запрос: %s', url)
                    return await r.text()
            except asyncio.CancelledError:
                # Cancellation must propagate so the task can actually finish.
                raise
            except Exception:
                log.exception('Request failed, retrying: %s', url)
                # Non-blocking back-off before the next attempt.
                await asyncio.sleep(1)
def get_html(request):
    """Parse raw HTML text into an lxml document tree."""
    tree = lxml.html.fromstring(request)
    return tree
async def crawl(future, client, pool):
    """Recursively crawl: await *future* for a URL list, fetch each URL,
    parse it in the thread *pool*, and crawl whatever URLs parse() returns.

    Recursion bottoms out when parse() yields no new URLs.  The event loop
    is obtained here instead of relying on the module-global ``loop`` that
    only exists when the file runs as a script.
    """
    loop = asyncio.get_event_loop()
    pending = []
    urls = await future
    for fetched in asyncio.as_completed([request(client, url) for url in urls]):
        page_text = await fetched
        # CPU-bound lxml parsing runs off the event loop, in the pool.
        parse_future = loop.run_in_executor(pool, parse, page_text)
        pending.append(asyncio.ensure_future(crawl(parse_future, client, pool)))
    if pending:
        await asyncio.wait(pending)
async def start_main(root_urls):
    """Entry coroutine: seed the crawler with *root_urls* and run it to completion."""
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor(max_workers=30) as pool:
        async with aiohttp.ClientSession() as client:
            # Pre-resolved future so crawl() can treat the seed URLs exactly
            # like the results of an in-flight parse.
            seed = loop.create_future()
            seed.set_result(root_urls)
            await crawl(seed, client, pool)
def parse(page_text):
    """Parse one fetched page; return the list of URLs to crawl next.

    The page type is decided from the breadcrumb text:
    * "History" listing pages -> collect pagination links (first page only)
      plus all device links from the rating table;
    * device detail pages -> print spec name/value rows and return no URLs,
      which terminates the crawl recursion for that branch.
    """
    urls = []
    html = get_html(page_text)
    url = html.cssselect('link[rel = "canonical"]')[0].get('href')
    print("URL:", url)
    breadcrumbs = html.cssselect('div.breadcrumbs>span.almost_bold')[0].text
    if breadcrumbs == "History":
        pagination = int(html.cssselect('div.rating_pagination.pagination>span')[0].text)
        # Only page 1 enumerates the other pagination links, so each listing
        # page gets queued exactly once.
        if pagination == 1:
            for a in html.cssselect('div.rating_pagination.pagination>a'):
                urls.append(main_type + a.get("href"))
        # Device links from the rating table, de-duplicated via a set.
        href_set = {main_type + a.get("href")
                    for a in html.cssselect('table.rating.responsive>tr:not([class])>td[style="text-align:left"]>a')}
        urls.extend(href_set)
    else:
        # Detail page.  These fields are extracted but not persisted yet
        # (`device_type`/`title`/`group` are unused placeholders for storage).
        device_type = html.cssselect('div.breadcrumbs>span[itemprop = "itemListElement"]')[1].cssselect('a')[0].get('title')
        group_list = html.cssselect('div.tbt2.row_heading>div>h2')[1:]
        title = html.cssselect('meta[property = "og:title"]')[0].get("content")
        tables = html.cssselect('div.tbt1.single>div.table')
        for index, table in enumerate(tables):
            # NOTE(review): this selects 'div.tbt5' from the WHOLE document on
            # every iteration rather than from *table* -- looks unintended;
            # confirm whether table.cssselect('div.tbt5') was meant.
            for tbt in html.cssselect('div.tbt5'):
                divs = tbt.cssselect('div')
                one_block = divs[1].text
                if one_block is None:
                    break
                two_block = divs[2].text
                if two_block is None:
                    # The value may live in a nested <span> or, failing that, <a>.
                    try:
                        two_block = divs[2].cssselect('span')[0].text
                    except IndexError:
                        two_block = divs[2].cssselect('a')[0].text
                # Normalize boolean markers to 1/0.
                if two_block == "+":
                    two_block = 1
                elif two_block == "-":
                    two_block = 0
                group = group_list[index].text
                print(one_block, ":", two_block)
    return urls
if __name__ == '__main__':
    start_time = time.time()
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(start_main(url_main_list))
    except KeyboardInterrupt:
        # Clean shutdown: cancel everything first, then run the loop until
        # every task has observed its cancellation.  Cancelling/awaiting tasks
        # one at a time (and closing the loop while siblings are still
        # pending) is what produced "Task was destroyed but it is pending!".
        # asyncio.Task.all_tasks() is deprecated since 3.7 in favour of
        # asyncio.all_tasks().
        tasks = asyncio.all_tasks(loop)
        for task in tasks:
            task.cancel()
        with suppress(asyncio.CancelledError):
            loop.run_until_complete(asyncio.gather(*tasks, return_exceptions=True))
    finally:
        loop.close()
    log.info('Time work: %s', time.time() - start_time)
在 parse 方法中确定页面类型并据此解析。解析器运行完成后(超过 2 小时),可能由于出错,出现了以下消息
错误:
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:186> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x00000218EFBE6CA8>()]> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436]>
2018-11-16 08:13:35,933 ERROR:Task was destroyed but it is pending!
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:186> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x00000218EFBE6D38>()]> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436]>
2018-11-16 08:13:35,933 ERROR:Task was destroyed but it is pending!
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:186> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x00000218EFBE6D08>()]> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436]>
2018-11-16 08:13:35,933 ERROR:Task was destroyed but it is pending!
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:186> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x00000218EFBE6C78>()]> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436]>
2018-11-16 08:13:35,933 ERROR:Task was destroyed but it is pending!
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:186> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x00000218EFBE6BE8>()]> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436]>
2018-11-16 08:13:35,933 ERROR:Task was destroyed but it is pending!
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:186> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x00000218EFBE6C48>()]> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436]>
2018-11-16 08:13:35,933 ERROR:Task was destroyed but it is pending!
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:186> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x00000218EFBE6B88>()]> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436]>
2018-11-16 08:13:35,933 ERROR:Task was destroyed but it is pending!
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:186> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x00000218EFBE6978>()]> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436]>
2018-11-16 08:13:35,933 ERROR:Task was destroyed but it is pending!
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:186> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x0000021959BFF468>()]> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436]>
2018-11-16 08:13:35,933 ERROR:Task was destroyed but it is pending!
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:186> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x0000021959BFF3A8>()]> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436]>
2018-11-16 08:13:35,935 ERROR:Task was destroyed but it is pending!
在调试模式下:
2018-11-16 21:45:18,943 ERROR:Task was destroyed but it is pending!
source_traceback: Object created at (most recent call last):
File "D:/Git/technical.city.py", line 222, in <module>
loop.run_until_complete(start_main(url_main_list))
File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 555, in run_until_complete
self.run_forever()
File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 523, in run_forever
self._run_once()
File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 1750, in _run_once
handle._run()
File "C:\ProgramData\Anaconda3\lib\asyncio\events.py", line 88, in _run
self._context.run(self._callback, *self._args)
File "D:/Git/technical.city.py", line 193, in crawl
futures.append(asyncio.ensure_future(crawl(parse_future, client, pool)))
task: <Task pending coro=<crawl() running at D:/Git/technical.city.py:187> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:348, <TaskWakeupMethWrapper object at 0x000002A0E1BE6A08>()] created at C:\ProgramData\Anaconda3\lib\asyncio\base_events.py:377> cb=[_wait.<locals>._on_completion() at C:\ProgramData\Anaconda3\lib\asyncio\tasks.py:436] created at D:/Git/technical.city.py:193>
在调试模式下,链接很少
2018-11-17 19:06:51,230 WARNING:Executing <Handle <TaskWakeupMethWrapper object at 0x0000018B98CB8FD8>(<Future finis...events.py:377>) created at C:\ProgramData\Anaconda3\lib\asyncio\futures.py:323> took 0.125 seconds
2018-11-17 19:06:59,041 WARNING:Executing <Handle <TaskWakeupMethWrapper object at 0x0000018B994C5DF8>(<Future finis...events.py:377>) created at C:\ProgramData\Anaconda3\lib\asyncio\queues.py:66> took 0.266 seconds
2018-11-17 19:07:00,849 WARNING:Executing <Handle <TaskWakeupMethWrapper object at 0x0000018B990C9E58>(<Future finis...events.py:377>) created at C:\ProgramData\Anaconda3\lib\asyncio\queues.py:66> took 0.203 seconds
2018-11-17 19:07:02,770 WARNING:Executing <Handle <TaskWakeupMethWrapper object at 0x0000018B994D7378>(<Future finis...events.py:377>) created at C:\ProgramData\Anaconda3\lib\asyncio\queues.py:66> took 0.265 seconds
2018-11-17 19:07:03,600 WARNING:Executing <Handle <TaskWakeupMethWrapper object at 0x0000018B9910EE28>(<Future finis...events.py:377>) created at C:\ProgramData\Anaconda3\lib\asyncio\queues.py:66> took 0.500 seconds
2018-11-17 19:07:04,055 WARNING:Executing <Handle <TaskWakeupMethWrapper object at 0x0000018B990B7FD8>(<Future finis...events.py:377>) created at C:\ProgramData\Anaconda3\lib\asyncio\queues.py:66> took 0.453 seconds
2018-11-17 19:07:05,587 INFO:poll 2735.000 ms took 1531.000 ms: 1 events
2018-11-17 19:07:08,990 INFO:poll 14063.000 ms took 1250.000 ms: 1 events
2018-11-17 19:07:10,255 INFO:poll 12797.000 ms took 1250.000 ms: 1 events
2018-11-17 19:07:15,466 INFO:poll 11500.000 ms took 5171.000 ms: 1 events
2018-11-17 19:07:17,050 INFO:poll 5797.000 ms took 1047.000 ms: 1 events
2018-11-17 19:07:19,574 INFO:poll 4563.000 ms took 2344.000 ms: 1 events
2018-11-17 19:07:20,964 INFO:poll 2204.000 ms took 1375.000 ms: 1 events
2018-11-17 19:07:22,120 WARNING:Executing <TimerHandle cancelled when=149422 _weakref_handle((<weakref at 0...0018B98CDCAC8>, '_cleanup')) at C:\ProgramData\Anaconda3\lib\site-packages\aiohttp\helpers.py:579 created at C:\ProgramData\Anaconda3\lib\site-packages\aiohttp\helpers.py:593> took 0.328 seconds
2018-11-17 19:07:25,250 INFO:poll took 1563.000 ms: 1 events
我知道 start_main 方法比其派生的任务更早完成,但我不知道如何解决。请对代码提出建议或指出问题所在。 Python 版本:3.7