I want to use an item pipeline to store the scraped items in a database. Here is my spider:
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from CollecteurImmobilier.items import CollecteurimmobilierItem


class AnnonceSpider(CrawlSpider):
    name = "Annonce"
    allowed_domains = ["tayara.tn"]
    start_urls = ["http://www.tayara.tn/sousse/immobilier-%C3%A0_vendre"]
    # follow pagination links like "?o=2" and parse each page with parse_start_url
    rules = (Rule(SgmlLinkExtractor(allow=(r'\?o=\d',)), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        sel = Selector(response)
        DivAnnonces = sel.xpath('//div[@class="item"]')
        items = []
        for DivAnnonce in DivAnnonces:
            item = CollecteurimmobilierItem()
            # extract() returns a list of all matching hrefs
            item['link'] = DivAnnonce.xpath('.//h2/a/@href').extract()
            items.append(item)
        return items
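Note that extract() returns a list, so item['link'] holds something like ['http://...'] rather than a single string, and that list is what later reaches the pipeline's SQL parameters. A minimal sketch of taking the first match instead, assuming each div.item contains exactly one link:

    def parse_start_url(self, response):
        sel = Selector(response)
        items = []
        for DivAnnonce in sel.xpath('//div[@class="item"]'):
            item = CollecteurimmobilierItem()
            links = DivAnnonce.xpath('.//h2/a/@href').extract()
            if links:
                # store a plain string, not a one-element list,
                # so the pipeline's %s parameter binds cleanly
                item['link'] = links[0]
                items.append(item)
        return items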
Here is my pipeline:
from scrapy import log
from twisted.enterprise import adbapi


class MySQLStorePipeline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            use_unicode=True,
        )
        # adbapi imports the MySQLdb module by name and runs queries
        # in a thread pool
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        # run the db query in the thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item, spider)
        query.addErrback(self._handle_error, item, spider)
        # at the end, return the item whether the query succeeded or failed
        query.addBoth(lambda _: item)
        # return the deferred instead of the item. This makes the engine
        # process the next item (according to the CONCURRENT_ITEMS setting)
        # only after this operation (deferred) has finished.
        return query

    def _conditional_insert(self, tx, item, spider):
        # note: item['link'] is whatever the spider stored -- here a list,
        # because extract() returns a list of matches
        tx.execute("SELECT * FROM AnnonceGratuit WHERE link = %s",
                   (item['link'],))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute("INSERT INTO AnnonceGratuit (link) VALUES (%s)",
                       (item['link'],))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def _handle_error(self, failure, item, spider):
        """Handle errors raised during the db interaction."""
        # do nothing, just log
        log.err(failure)
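One way to rule the schema in or out is a minimal synchronous sketch that exercises the same insert with MySQLdb directly; if the CREATE TABLE failed, this raises an error immediately. The connection parameters here are placeholders:

    import MySQLdb

    conn = MySQLdb.connect(host='localhost', db='mydb',
                           user='myuser', passwd='mypass',
                           charset='utf8', use_unicode=True)
    cur = conn.cursor()
    cur.execute("INSERT INTO AnnonceGratuit (link) VALUES (%s)",
                ('http://example.com/annonce/1',))
    # MySQLdb does not autocommit; adbapi's runInteraction commits for you,
    # but a plain connection needs an explicit commit
    conn.commit()
    cur.execute("SELECT link FROM AnnonceGratuit")
    print cur.fetchall()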
Here is my mysql.sql:
DROP TABLE IF EXISTS AnnonceGratuit;
CREATE TABLE AnnonceGratuit (
    -- MySQL requires an explicit length for VARCHAR columns;
    -- a bare VARCHAR is a syntax error and the CREATE TABLE fails
    link VARCHAR(255),
    title VARCHAR(255)
) DEFAULT CHARSET=utf8;
and I added this to my settings:
ITEM_PIPELINES = {
'CollecteurImmobilier.pipelines.MySQLStorePipeline': 300,
}
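The pipeline's from_settings also reads the MYSQL_* connection keys, so settings.py needs those as well. A minimal sketch, with placeholder values (the key names come from the pipeline above):

    MYSQL_HOST = 'localhost'
    MYSQL_DBNAME = 'mydb'      # placeholder -- use your database name
    MYSQL_USER = 'myuser'      # placeholder
    MYSQL_PASSWD = 'mypass'    # placeholder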
But when I run my spider like this:

scrapy crawl Annonce -o items.xml -t xml

there are no errors in my terminal. While the spider is running I see the message "Item already stored in db". The items.xml file is written successfully, but there is nothing in my database. Can anyone help me? Thanks.
Answer 0 (score: 0)
Try using a db index to detect duplicates:
import MySQLdb  # needed for the IntegrityError class

    def _conditional_insert(self, tx, item, spider):
        try:
            tx.execute("INSERT INTO AnnonceGratuit (link) VALUES (%s)",
                       (item['link'],))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)
        except MySQLdb.IntegrityError:
            # the UNIQUE index rejects duplicates; a bare except would also
            # swallow real errors, so catch only the integrity violation
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
This should work if you add a unique index constraint on link in the database.
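For completeness, a sketch of the schema change this answer assumes, using the table and column names from the question's mysql.sql (the index name is arbitrary):

    ALTER TABLE AnnonceGratuit ADD UNIQUE INDEX idx_link (link);

With utf8 and VARCHAR(255), the key stays within MySQL's 767-byte index length limit.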