I have a Python class built on python + tornado that works like a crawler. I have many links on the same site and I need to get the responses for all of them into my database. The difficulty: I can't figure out how to capture the URL together with its error (a timeout, or a page that is slow to respond). I know how to solve this with newbie code (I've only been writing Python for a week) - by comparing the list of input links against the list of outputs - but I'd like to do it the right way. Can you tell me what I should do?
import sys
import time
import requests
import json
from tornado import gen, ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.queues import Queue


class Scraper():
    def __init__(self, source='', destinations=None, transform=None, headers={},
                 max_clients=20, maxsize=20, connect_timeout=600, request_timeout=600):
        """Instantiate a tornado async http client to do many URL requests"""
        if destinations is None or transform is None:
            sys.stderr.write('You must pass both a collection of URLs and a transform function\n')
            raise SystemExit

        self.max_clients = max_clients
        self.maxsize = maxsize
        self.connect_timeout = connect_timeout
        self.request_timeout = request_timeout
        # AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient", max_clients=50)
        AsyncHTTPClient.configure("tornado.simple_httpclient.SimpleAsyncHTTPClient",
                                  max_clients=self.max_clients)
        self.headers = headers
        self.http_client = AsyncHTTPClient()
        self.queue = Queue(maxsize=self.maxsize)
        self.source = source
        self.destinations = destinations
        self.transform = transform

        # Fill the queue with URLs and start the consumer coroutine.
        self.read(self.destinations)
        self.get(self.transform, self.headers, self.connect_timeout, self.request_timeout)

        # Run the IOLoop until every queued URL has been processed.
        self.loop = ioloop.IOLoop.current()
        self.join_future = self.queue.join()

        def done(future):
            self.loop.stop()

        self.join_future.add_done_callback(done)
        self.loop.start()

    @gen.coroutine
    def read(self, destinations):
        for url in destinations:
            yield self.queue.put(url)

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout):
        while True:
            url = yield self.queue.get()
            request = HTTPRequest(url,
                                  connect_timeout=connect_timeout,
                                  request_timeout=request_timeout,
                                  method="GET",
                                  headers=headers)
            future = self.http_client.fetch(request)

            def done_callback(future):
                self.queue.task_done()
                body = future.result().body
                transform(body)

            future.add_done_callback(done_callback)


def transform_data(body, url=''):
    # SOMECODE
    pass


a = ['link1', 'link2']
scraper = Scraper(destinations=a, transform=transform_data)
Answer 0 (score: 0)
In a coroutine you can "yield" a future. The coroutine pauses until the future is resolved with a result or an exception:
try:
    result = yield self.http_client.fetch(request)
except Exception as exc:
    print("Failure!: %s" % exc)
except tornado.gen.TimeoutError:
    print("Timeout!")
else:
    self.queue.task_done()
    body = result.body
    transform(body)
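
A minimal sketch of how the question's `get` coroutine could be rewritten along those lines, so that every result or error is recorded together with the URL that produced it. The `self.results` and `self.errors` dictionaries are hypothetical additions (they would need to be initialised in `__init__`, e.g. `self.results = {}` and `self.errors = {}`), not part of the original class:

    @gen.coroutine
    def get(self, transform, headers, connect_timeout, request_timeout):
        while True:
            url = yield self.queue.get()
            request = HTTPRequest(url,
                                  connect_timeout=connect_timeout,
                                  request_timeout=request_timeout,
                                  method="GET",
                                  headers=headers)
            try:
                # Yielding the fetch pauses this coroutine until the response
                # (or an exception such as a timeout) is available.
                response = yield self.http_client.fetch(request)
            except Exception as exc:
                # The URL is still in scope here, so it can be stored
                # alongside the error instead of being lost.
                self.errors[url] = exc
            else:
                self.results[url] = transform(response.body)
            finally:
                # Mark the queue item as processed whether it succeeded or
                # failed, so queue.join() can eventually complete.
                self.queue.task_done()

After the loop has drained the queue, `self.errors` would hold every URL whose request timed out or failed for some other reason, and `self.results` every successful response, which is the kind of bookkeeping the question asked about.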