I want to use Scrapy together with Couchbase to store/retrieve my data.
I am confused about which approach to adopt for storing and retrieving the data.
I mean something like this:
class CouchbasePipeline(object):
    def __init__(self):
        ## init client here using settings
        pass

    def process_item(self, item, spider):
        ## store item here
        pass
Or something like this:
class CouchBaseCacheStorage(object):
    def __init__(self, settings):
        ## init client here using settings
        pass

    def get_response(self, spider, request):
        pass

    def save_response(self, spider, request, response):
        pass
Or should I implement both (one managing the cache, the other the database)?
I am really confused, especially since I am new to Python/Couchbase/Scrapy. My question is not about the best implementation/tool, but about the standard Scrapy way of doing this kind of thing, since I could not find it in the documentation or on the web.
Thanks in advance for your help.
Answer 0 (score: 1)
Here is the solution I implemented:
Code:
from datetime import datetime

from scrapy import log, signals
from scrapy.conf import settings
from scrapy.xlib.pydispatch import dispatcher
from couchbase import Couchbase
from couchbase.exceptions import CouchbaseError


class CouchbaseStore(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self._server = settings.get('COUCHBASE_SERVER')
        self._bucket = settings.get('COUCHBASE_BUCKET')
        self._password = settings.get('COUCHBASE_PASSWORD')
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def process_item(self, item, spider):
        data = {}
        for key in item.keys():
            if isinstance(item[key], datetime):
                data[key] = item[key].isoformat()
            else:
                data[key] = item[key]
        ## I assume the item has a unique time field
        key = "{0}".format(item['time'].isoformat())
        self.cb.set(key, data)
        log.msg("Item with key %s stored in bucket %s / node %s" %
                (key, settings['COUCHBASE_BUCKET'],
                 settings['COUCHBASE_SERVER']),
                level=log.INFO, spider=spider)
        return item

    def spider_opened(self, spider):
        self._server = settings['COUCHBASE_SERVER']
        self._bucket = settings['COUCHBASE_BUCKET']
        self._password = settings['COUCHBASE_PASSWORD']
        try:
            self.cb = Couchbase.connect(self._bucket)
        except CouchbaseError:
            log.msg('Connection problem to bucket %s' % self._bucket,
                    log.ERROR)
        log.msg("CouchbaseStore.spider_opened called", level=log.DEBUG)

    def spider_closed(self, spider):
        self.cb._close()
        log.msg("CouchbaseStore.spider_closed called", level=log.DEBUG)
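For completeness, the pipeline still has to be enabled in the project settings. A minimal settings.py sketch, assuming the class lives in a module named myproject.pipelines (the path and values here are placeholders to adjust to your project):

# settings.py -- the module path and values below are placeholders.
COUCHBASE_SERVER = 'localhost'
COUCHBASE_BUCKET = 'scrapy'
COUCHBASE_PASSWORD = ''

# On older Scrapy versions ITEM_PIPELINES is a plain list of class paths.
ITEM_PIPELINES = {
    'myproject.pipelines.CouchbaseStore': 300,
}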
Answer 1 (score: 1)
A few code suggestions on top of @agstudy's answer:
- the settings are already passed into __init__, so use them from there instead of importing from scrapy.conf import settings;
- added a _port option;
- renamed self.cb to self.couchbase, so it does not get confused with "callback".
See below:
from datetime import datetime

from scrapy import log, signals
from couchbase import Couchbase
from couchbase.exceptions import CouchbaseError


class CouchbaseStore(object):
    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def __init__(self, settings):
        self._server = settings.get('COUCHBASE_SERVER')
        self._port = settings.get('COUCHBASE_PORT', 8091)
        self._bucket = settings.get('COUCHBASE_BUCKET')
        self._password = settings.get('COUCHBASE_PASSWORD')

    def process_item(self, item, spider):
        data = {}
        for key in item.keys():
            if isinstance(item[key], datetime):
                data[key] = item[key].isoformat()
            else:
                data[key] = item[key]
        ## I assume the item has a unique time field
        key = "{0}".format(item['time'].isoformat())
        self.couchbase.set(key, data)
        log.msg("Item with key %s stored in bucket %s / node %s" %
                (key, self._bucket, self._server),
                level=log.INFO, spider=spider)
        return item

    def spider_opened(self, spider):
        try:
            self.couchbase = Couchbase.connect(bucket=self._bucket,
                                               host=self._server,
                                               port=self._port,
                                               password=self._password)
        except CouchbaseError:
            log.msg('Connection problem to bucket %s' % self._bucket,
                    log.ERROR)
        log.msg("CouchbaseStore.spider_opened called", level=log.DEBUG)

    def spider_closed(self, spider):
        self.couchbase._close()
        log.msg("CouchbaseStore.spider_closed called", level=log.DEBUG)
Answer 2 (score: 0)
The standard way to store data is an item pipeline, but to retrieve data I think you should use a downloader middleware. For clarity, check the Scrapy architecture overview, especially the architecture diagram.
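To make the retrieval side concrete, here is a rough sketch of such a downloader middleware: it returns a stored response from process_request() on a cache hit (which makes Scrapy skip the download) and stores fresh responses in process_response(). It reuses the assumed COUCHBASE_* settings and client API from the answers above; the URL-as-key scheme and the quiet=True / rv.success handling are assumptions, not tested code:

from couchbase import Couchbase
from scrapy.http import HtmlResponse


class CouchbaseCacheMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.couchbase = Couchbase.connect(
            bucket=settings.get('COUCHBASE_BUCKET'),
            host=settings.get('COUCHBASE_SERVER'),
            port=settings.get('COUCHBASE_PORT', 8091),
            password=settings.get('COUCHBASE_PASSWORD'))

    def process_request(self, request, spider):
        # Returning a Response here short-circuits the actual download.
        rv = self.couchbase.get(request.url, quiet=True)
        if not rv.success:
            return None  # cache miss: let the downloader fetch it
        return HtmlResponse(url=request.url,
                            body=rv.value['body'].encode('utf-8'),
                            encoding='utf-8',
                            request=request)

    def process_response(self, request, response, spider):
        # Store the decoded body keyed by URL for later retrieval.
        self.couchbase.set(request.url, {'body': response.body_as_unicode()})
        return response

The middleware would then be enabled via DOWNLOADER_MIDDLEWARES in settings.py, like the pipeline registration shown earlier.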