您好我正在使用python scrape库来创建蜘蛛并从网站中提取数据。在我的管道中,我使用Flask-SQLAlchemy配置蜘蛛,以便将已删除的数据添加到SQLite表中。我试图弄清楚如何防止蜘蛛向表中添加重复数据,因此我查询数据库以查看表是否已包含与蜘蛛将要添加到表中的元素相同的元素。如果是这样,我不希望蜘蛛添加数据,只是转移到下一个元素。这是我到目前为止的代码:
from flask import Flask, render_template, redirect, request, url_for
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy.orm import sessionmaker
import os
import sys
from sqlalchemy import Column, ForeignKey, Integer, String, DateTime, union
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
app = Flask(__name__)
# Flask-SQLAlchemy
app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False
app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite://draft.db"
app.config["SQLALCHEMY_ECHO"] = True
db = SQLAlchemy(app)
def db_connect():
return create_engine('sqlite:///draft.db')
Base = declarative_base()
class Events(Base):
__tablename__ = 'events'
id = Column(Integer, primary_key=True)
title = Column(String(250), nullable=False)
location = Column(String(250), nullable=False)
date = Column(String(250), nullable=False)
url = Column(String(250), nullable=False)
engine = create_engine('sqlite:///draft.db')
def create_events_table(engine):
Base.metadata.create_all(engine)
class GeneralPipeline(object):
def __init__(self):
engine = db_connect()
self.Session = sessionmaker(bind=engine)
def process_item(self, item, spider):
session = self.Session()
event = Events(**item)
duplicates = union(session.query(Events).filter_by(title=item["title"]), session.query(Events).filter_by(date=item["date"]))
if duplicates:
session.close()
else:
try:
session.add(event)
session.commit()
except:
session.rollback()
raise
finally:
session.close()
return item
这将返回错误引发TypeError(“此子句的布尔值未定义”),我假设它引用了if duplicates:语句但是我不确定为什么这不起作用。我只是想检查是否存在任何重复项,如果存在,蜘蛛不应该将该数据元素添加到表中。关于如何使这个系统工作的任何指针?
以下是更多的请求堆栈跟踪:
TypeError: Boolean value of this clause is not defined
2016-12-06 21:02:34 [scrapy] ERROR: Error processing {'location': None, 'date': datetime.datetime(2017, 1, 21, 21, 0), 'url': 'http://www.trumba.com/events-calendar/ma/boston/harvard-events/harvard-event-calendar/harvard-activities/gazette', 'title': 'Varsity Show'}
Traceback (most recent call last):
File "/usr/local/lib/python3.4/dist-packages/twisted/internet/defer.py", line 649, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/ubuntu/workspace/project/draft/draft/draft/pipelines.py", line 58, in process_item
if duplicates:
File "/usr/local/lib/python3.4/dist-packages/sqlalchemy/sql/elements.py", line 481, in __bool__
raise TypeError("Boolean value of this clause is not defined")
TypeError: Boolean value of this clause is not defined
2016-12-06 21:02:34 [scrapy] INFO: Closing spider (finished)
2016-12-06 21:02:34 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 524,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 352524,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 12, 7, 2, 2, 34, 379384),
'log_count/DEBUG': 3,
'log_count/ERROR': 201,
'log_count/INFO': 7,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2016, 12, 7, 2, 2, 32, 3552)}
2016-12-06 21:02:34 [scrapy] INFO: Spider closed (finished)