我正在尝试使用 Scrapy + MongoDB(PyMongo)编写爬虫,但收到错误:name must be an instance of basestring(name 必须是 basestring 的实例)。
我的 Spider 本身工作正常(它能够把数据写入 JSON),所以我猜错误出在我新写的管道(pipeline)中。以下是源代码:
import pymongo
from scrapy import log
from scrapy.conf import settings
from scrapy.exceptions import DropItem
class MongoDBPipeline(object):
    """Scrapy item pipeline that drops items with empty fields and stores the rest in MongoDB."""

    def __init__(self):
        """Read the connection parameters from the Scrapy settings and open the collection."""
        # NOTE(review): each lookup below uses its argument as a settings *key*.
        # 'localhost', '27017', 'IngressoRapido' and 'Shows' look like the intended
        # values rather than setting names, so these lookups presumably yield None,
        # which would explain the "name must be an instance of basestring" error
        # when the database/collection is indexed below -- confirm.
        self.server = settings['localhost']
        self.port = settings['27017']
        self.db = settings['IngressoRapido']
        self.col = settings['Shows']
        connection = pymongo.Connection(self.server, self.port)
        db = connection[self.db]
        self.collection = db[self.col]

    def process_item(self, item, spider):
        """Validate ``item``, insert it into the collection and return it.

        Raises:
            DropItem: if at least one field of ``item`` has a falsy value.
        """
        # Accumulates one message line per empty field; stays '' when the item is complete.
        err_msg = ''
        # In this loop ``banda`` is the field name and ``local`` the field value.
        for banda, local in item.items():
            if not local :
                err_msg += 'Faltando local %s da banda %s\n' % (banda, item['banda'])
        if err_msg:
            raise DropItem(err_msg)
        # Store a plain-dict copy of the item.
        self.collection.insert(dict(item))
        log.msg('Item written to MongoDB database %s/%s' % (self.db, self.col),
                level=log.DEBUG, spider=spider)
        # Return the item so that later pipeline stages can keep processing it.
        return item
答案 0 :(得分:3)
看起来您是想连接到 localhost 的 27017 端口,但代码实际上是把这些值当作键,去设置(settings)中查找对应的值。您想要的是不是下面这样?
def __init__(self):
    """Set the MongoDB connection parameters as literal values.

    The host, database and collection names are strings; the port is an
    integer because PyMongo rejects a string port.
    """
    self.server = 'localhost'
    # Must be an int: PyMongo raises TypeError when given a str port.
    self.port = 27017
    self.db = 'IngressoRapido'
    self.col = 'Shows'
答案 1 :(得分:0)
以下代码可以正常运行,并且能够正确清理资源。可以通过 from_crawler 方法读取设置。
class MongoPipeline(object):
    """Save each scraped item to a MongoDB collection.

    The connection is opened when the spider starts and closed when it
    finishes, so the client is always released.
    """

    def __init__(self, mongo_server, mongo_port, mongo_db, mongo_collection):
        """Store the connection parameters; no connection is opened here.

        Args:
            mongo_server: host name (or MongoDB URI) passed to ``MongoClient``.
            mongo_port: port passed to ``MongoClient``; may be ``None``.
            mongo_db: name of the database to write to.
            mongo_collection: name of the collection to write to.
        """
        self.mongo_server = mongo_server
        self.mongo_port = mongo_port
        self.mongo_db = mongo_db
        self.mongo_collection = mongo_collection

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``MONGODB_*`` values in ``crawler.settings``."""
        port = crawler.settings.get('MONGODB_PORT')
        return cls(
            mongo_server=crawler.settings.get('MONGODB_SERVER'),
            # Settings may hold the port as a string (e.g. when given on the
            # command line), but MongoClient only accepts an int. None is left
            # untouched so the driver's default applies.
            mongo_port=int(port) if port is not None else None,
            mongo_db=crawler.settings.get('MONGODB_DB'),
            mongo_collection=crawler.settings.get('MONGODB_COLLECTION'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the target database."""
        self.client = pymongo.MongoClient(self.mongo_server, self.mongo_port)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB client opened in ``open_spider``."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert a dict copy of ``item`` into the collection and return ``item``."""
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self.db[self.mongo_collection].insert_one(dict(item))
        return item
注意:请在 pipelines.py 中导入 pymongo。
请查看官方文档。 http://doc.scrapy.org/en/latest/topics/item-pipeline.html#write-items-to-mongodb