I'm trying to asynchronously scrape data from a video game's leaderboard. There is a weekly and a daily challenge. So far I've based my code on this async client with semaphores. The difference is that I'm trying to move the end of the loop into a function. Here's the relevant part of my code:
from urllib.parse import urljoin
import asyncio
import aiohttp

async def fetch(url, session):
    async with session.get(url) as response:
        return await response.read()

async def bound_fetch(url, session, sem):
    async with sem:
        await fetch(url, session)

async def fetch_pages(url, pages, session):
    tasks = []
    sem = asyncio.Semaphore(LIMIT)
    for page in range(pages + 1):
        task_url = urljoin(url, str(page))
        task = asyncio.ensure_future(bound_fetch(task_url, session, sem))
        tasks.append(task)
    await asyncio.gather(*tasks)

def leaderboard_crawler(date, entries=0, pages=1):
    website = "https://www.thronebutt.com/archive/"
    date_url = urljoin(website, date + "/")
    entries_per_page = 30
    number_of_entries = entries or pages * entries_per_page
    full_pages, last_page = divmod(number_of_entries, 30)
    entry_list = [30 for x in range(full_pages)]
    if last_page != 0:
        entry_list.append(last_page)
    loop = asyncio.get_event_loop()
    with aiohttp.ClientSession() as session:
        future = asyncio.ensure_future(fetch_pages(date_url, pages, session))
        date_html = loop.run_until_complete(future)
        return date_html

def weekly_leaderboard(week, year, entries=0, pages=1):
    weekly_date = "{0:02d}{1}".format(week, year)
    return leaderboard_crawler(weekly_date, entries, pages)

def daily_leaderboard(day, month, year, entries=0, pages=1):
    daily_date = "{0:02d}{1:02d}{2}".format(day, month, year)
    return leaderboard_crawler(daily_date, entries, pages)
I think the problem is in the asyncio.gather(*tasks) part of the fetch_pages function. I can't figure out how to pass its result up to leaderboard_crawler. Right now date_html is None. I've tried return await asyncio.gather(*tasks), which returns an array of Nones. I've also tried wrapping it in asyncio.ensure_future and passing that to loop.run_until_complete, but that doesn't seem to work either.
Answer (score: 2)

The reason is simple: you're missing a return at each level of the call stack:
async def bound_fetch(url, session, sem):
    async with sem:
        # await fetch(url, session)  # missing return
        return await fetch(url, session)  # this one is right

async def fetch_pages(url, pages, session):
    tasks = []
    sem = asyncio.Semaphore(LIMIT)
    for page in range(pages + 1):
        task_url = urljoin(url, str(page))
        task = asyncio.ensure_future(bound_fetch(task_url, session, sem))
        tasks.append(task)
    # await asyncio.gather(*tasks)  # missing return
    return await asyncio.gather(*tasks)  # this one is right
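This also explains the array of Nones you saw: a coroutine that finishes without a return statement implicitly returns None, just like an ordinary function, so once you added the return only in fetch_pages, gather still collected the None that each bound_fetch produced. A minimal, self-contained demonstration of that behavior (not part of the crawler):

import asyncio

async def no_return():
    await asyncio.sleep(0)  # does the work, but returns nothing

async def with_return():
    return await asyncio.sleep(0, result="data")  # propagates a value

loop = asyncio.get_event_loop()
print(loop.run_until_complete(asyncio.gather(no_return(), with_return())))
# prints: [None, 'data']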
Here is the full working example:
from urllib.parse import urljoin
import asyncio
import aiohttp

async def fetch(url, session):
    async with session.get(url) as response:
        return await response.read()

async def bound_fetch(url, session, sem):
    async with sem:
        return await fetch(url, session)

async def fetch_pages(url, pages, session):
    tasks = []
    sem = asyncio.Semaphore(5)
    for page in range(pages + 1):
        task_url = urljoin(url, str(page))
        task = asyncio.ensure_future(bound_fetch(task_url, session, sem))
        tasks.append(task)
    return await asyncio.gather(*tasks)

def leaderboard_crawler(date, entries=0, pages=1):
    website = "https://www.thronebutt.com/archive/"
    date_url = urljoin(website, date + "/")
    entries_per_page = 30
    number_of_entries = entries or pages * entries_per_page
    full_pages, last_page = divmod(number_of_entries, 30)
    entry_list = [30 for x in range(full_pages)]
    if last_page != 0:
        entry_list.append(last_page)
    loop = asyncio.get_event_loop()
    with aiohttp.ClientSession() as session:
        future = asyncio.ensure_future(fetch_pages(date_url, pages, session))
        date_html = loop.run_until_complete(future)
        return date_html

def weekly_leaderboard(week, year, entries=0, pages=1):
    weekly_date = "{0:02d}{1}".format(week, year)
    return leaderboard_crawler(weekly_date, entries, pages)

def daily_leaderboard(day, month, year, entries=0, pages=1):
    daily_date = "{0:02d}{1:02d}{2}".format(day, month, year)
    return leaderboard_crawler(daily_date, entries, pages)
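One caveat: this example enters the ClientSession with a plain with statement, which older aiohttp releases allowed (with a deprecation warning); on aiohttp 3.x the session must be opened with async with from inside a coroutine. A minimal sketch of the same crawler adapted for that, using a helper coroutine I'm introducing here (named crawl, hypothetical), with the other functions unchanged:

async def crawl(date_url, pages):
    # open the session inside the event loop, as aiohttp 3.x requires
    async with aiohttp.ClientSession() as session:
        return await fetch_pages(date_url, pages, session)

def leaderboard_crawler(date, entries=0, pages=1):
    website = "https://www.thronebutt.com/archive/"
    date_url = urljoin(website, date + "/")
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(crawl(date_url, pages))

With either version, the top-level helpers return one raw-HTML bytes object per fetched page, e.g. daily_leaderboard(1, 1, 2017) builds the URL https://www.thronebutt.com/archive/01012017/ and returns the pages it fetched (the date is made up, shown only to illustrate the call).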