I'm trying to write to the log from my spider's __init__ method, but I can't seem to get it to work, even though it works fine in the parse method. The call to self.log in the init method is made by the method 'get_urls_from_file'. I know that method is being called, because I see the print statement on stdout, so I was wondering if anyone could point me in the right direction. I'm using Scrapy v0.18. Thanks!

My code is below:
from scrapy.spider import BaseSpider
from scrapy_redis import connection
from importlib import import_module
from scrapy import log
from scrapy.settings import CrawlerSettings


class StressS(BaseSpider):
    name = 'stress_s_spider'
    allowed_domains = ['www.example.com']

    def __init__(self, url_file=None, *args, **kwargs):
        super(StressS, self).__init__(*args, **kwargs)
        settings = CrawlerSettings(import_module('stress_test.settings'))
        if url_file:
            self.url_file = url_file
        else:
            self.url_file = settings.get('URL_FILE')
        self.start_urls = self.get_urls_from_file(self.url_file)
        self.server = connection.from_settings(settings)
        self.count_key = settings.get('ITEM_COUNT')

    def parse(self, response):
        self.log('Processed: %s, status code: %s' % (response.url, response.status), level=log.INFO)
        self.server.incr(self.count_key)

    def get_urls_from_file(self, fn):
        urls = []
        if fn:
            try:
                with open(fn, 'r') as f:
                    urls = [line.strip() for line in f.readlines()]
            except IOError:
                msg = 'File %s could not be opened' % fn
                print msg
                self.log(msg, level=log.ERROR)
        return urls
Answer 0 (score: 1)
You can override the start_requests method:
# Default value for the argument in case it's missing.
url_file = None

def start_requests(self):
    # The crawler (and its settings) is available here, unlike in __init__.
    settings = self.crawler.settings
    url_file = self.url_file if self.url_file else settings['URL_FILE']
    # set up server and count_key ...
    # finally yield the requests (Request comes from scrapy.http)
    for url in self.get_urls_from_file(url_file):
        yield Request(url, dont_filter=True)
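To make this concrete, here is a minimal sketch of the spider from the question rewritten this way. It assumes Request is imported from scrapy.http and that the parse and get_urls_from_file methods stay exactly as posted:

from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy_redis import connection


class StressS(BaseSpider):
    name = 'stress_s_spider'
    allowed_domains = ['www.example.com']
    # Default value in case the url_file argument is missing.
    url_file = None

    def start_requests(self):
        # By the time start_requests runs, the crawler is attached and
        # logging has been started, so the self.log call inside
        # get_urls_from_file is no longer lost.
        settings = self.crawler.settings
        url_file = self.url_file or settings.get('URL_FILE')
        self.server = connection.from_settings(settings)
        self.count_key = settings.get('ITEM_COUNT')
        for url in self.get_urls_from_file(url_file):
            yield Request(url, dont_filter=True)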
You can also override the set_crawler method and set the attributes there:
def set_crawler(self, crawler):
    super(MySpider, self).set_crawler(crawler)
    settings = crawler.settings
    # set up start_urls ...
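Applied to the spider from the question, that could look roughly like the following sketch (assuming url_file is kept as a class-level default of None, as in the previous snippet, and the rest of the class is unchanged):

def set_crawler(self, crawler):
    super(StressS, self).set_crawler(crawler)
    # The crawler and its settings are available here, unlike in __init__.
    settings = crawler.settings
    self.url_file = self.url_file or settings.get('URL_FILE')
    self.start_urls = self.get_urls_from_file(self.url_file)
    self.server = connection.from_settings(settings)
    self.count_key = settings.get('ITEM_COUNT')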
Answer 1 (score: 0)
This does not appear to be possible as of Scrapy 0.22.