I'm new to Python and I'm trying to write a crawler with Scrapy. I followed the Scrapy docs to drop duplicate scraped items, but it doesn't work: self.ids_seen comes back as []. Here is my pipeline:
from sqlalchemy.orm import sessionmaker
from scrapy.exceptions import DropItem
from models import Products, db_connect, create_products_table
import logging


class DuplicatesPipeline(object):

    def __init__(self):
        # ids already seen during this crawl
        self.ids_seen = set()
        logging.warning(self.ids_seen)

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['id'])
            logging.warning(item['id'])
            return item


class TikiPipeline(object):

    def __init__(self):
        engine = db_connect()
        create_products_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        session = self.Session()
        product = Products(**item)
        try:
            session.add(product)
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()
        return item
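To show what I expect the dedup logic to do, here is a minimal standalone sketch (the dummy item dicts and the spider=None argument are just placeholders, not part of my real spider):

from scrapy.exceptions import DropItem
from tiki.pipelines import DuplicatesPipeline

pipeline = DuplicatesPipeline()

# first item with id 1 passes through and the id is recorded
item = pipeline.process_item({'id': 1, 'name': 'foo'}, spider=None)
print(item)

# a second item with the same id should be dropped
try:
    pipeline.process_item({'id': 1, 'name': 'foo'}, spider=None)
except DropItem as exc:
    print(exc)  # "Duplicate item found: ..."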
My settings.py:
ITEM_PIPELINES = {
    'tiki.pipelines.DuplicatesPipeline': 100,
    'tiki.pipelines.TikiPipeline': 300,
}
Can anyone explain what set() returns? Thank you very much.
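For reference, a quick check of what set() gives (the variable name here is just an example):

ids_seen = set()         # set() builds a new, empty set
print(ids_seen)          # prints set() on Python 3 (set([]) on Python 2)
ids_seen.add(42)
print(42 in ids_seen)    # True -- the same membership test used in process_item
print(7 in ids_seen)     # False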