How can I store scraped items in a database using a Scrapy item pipeline?

Asked: 2014-04-22 18:52:44

Tags: python mysql scrapy pipeline

I want to use an item pipeline to store scraped items in a database.

Here is my spider:

from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from CollecteurImmobilier.items import CollecteurimmobilierItem

class AnnonceSpider(CrawlSpider):
    name = "Annonce"
    allowed_domains = ["tayara.tn"]
    start_urls = ["http://www.tayara.tn/sousse/immobilier-%C3%A0_vendre"]
    # Follow the pagination links (?o=<page>) and parse every result page.
    rules = (Rule(SgmlLinkExtractor(allow=('\\?o=\\d')), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        sel = Selector(response)
        DivAnnonces = sel.xpath('//div[@class="item"]')
        items = []
        for DivAnnonce in DivAnnonces:
            item = CollecteurimmobilierItem()
            item['link'] = DivAnnonce.xpath('.//h2/a/@href').extract()
            items.append(item)
        return items
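
One detail worth flagging: in this version of Scrapy, extract() returns a list of strings, so item['link'] holds a list rather than a single URL. A minimal sketch of keeping only the first match instead (a variation for illustration, not part of the original post):

# .extract() returns a list; keep the first href, or None if nothing matched.
links = DivAnnonce.xpath('.//h2/a/@href').extract()
item['link'] = links[0] if links else None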

Here is my pipeline:

from scrapy import log
from twisted.enterprise import adbapi
import MySQLdb  # the driver that adbapi.ConnectionPool loads by name below

class MySQLStorePipeline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the db query in the thread pool.
        query = self.dbpool.runInteraction(self._conditional_insert, item, spider)
        query.addErrback(self._handle_error, item, spider)
        # At the end, return the item whether the insert succeeded or failed.
        query.addBoth(lambda _: item)
        # Return the deferred instead of the item. This makes the engine
        # process the next item (according to the CONCURRENT_ITEMS setting)
        # only after this operation (deferred) has finished.
        return query

    def _conditional_insert(self, tx, item, spider):
        # item['link'] is the list returned by extract(); passed as the
        # parameter sequence, its single element fills the %s placeholder.
        tx.execute("""
            SELECT * FROM AnnonceGratuit WHERE link = %s
        """, (item['link']))
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute("""
                INSERT INTO AnnonceGratuit (link)
                VALUES (%s)
            """, (item['link']))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def _handle_error(self, failure, item, spider):
        """Handle an error raised during the db interaction."""
        # Do nothing, just log the failure.
        log.err(failure)

Here is my mysql.sql:

DROP TABLE IF EXISTS AnnonceGratuit;
CREATE TABLE AnnonceGratuit (
    -- MySQL requires an explicit length for VARCHAR columns.
    link VARCHAR(255),
    title VARCHAR(255)
) DEFAULT CHARSET=utf8;

And I added these lines to my settings:

ITEM_PIPELINES = {
    'CollecteurImmobilier.pipelines.MySQLStorePipeline': 300,
}
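
Note that from_settings reads its connection parameters from the project settings, so the MYSQL_* keys have to be defined there as well. A minimal sketch for settings.py, where the host, database name, and credentials below are placeholder assumptions:

# settings.py -- placeholder values, adjust to your own database
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'collecteur'   # hypothetical database name
MYSQL_USER = 'scrapy_user'    # hypothetical credentials
MYSQL_PASSWD = 'secret'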

But when I run my spider like this:

scrapy crawl Annonce -o items.xml -t xml

My terminal shows no errors.

While the spider is running I see the message "Item already stored in db".

The items.xml file is written successfully, but there is nothing in my database.
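
A quick way to rule out a connection or schema problem is to query the table outside Scrapy. A minimal standalone check with MySQLdb, assuming the same placeholder connection values as above:

import MySQLdb

# Connect with the same parameters the pipeline uses (placeholders here).
conn = MySQLdb.connect(host='localhost', db='collecteur',
                       user='scrapy_user', passwd='secret', charset='utf8')
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM AnnonceGratuit")
print cur.fetchone()[0]  # number of rows actually stored
conn.close()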

Please, can anyone help me? Thanks.

1 Answer:

Answer 0 (score: 0)

Try using a db index to detect duplicates:

def _conditional_insert(self, tx, item, spider):
    try:
        # Insert unconditionally; the unique index rejects duplicates.
        tx.execute("""
            INSERT INTO AnnonceGratuit (link)
            VALUES (%s)
        """, (item['link']))
        log.msg("Item stored in db: %s" % item, level=log.DEBUG)
    except MySQLdb.IntegrityError:
        # A duplicate link violated the unique index.
        log.msg("Item already stored in db: %s" % item, level=log.DEBUG)

This should work if you add a unique index constraint on the link column in the database.
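
For reference, a sketch of that constraint in the same style as the mysql.sql above (the index name idx_link is an arbitrary choice):

-- Reject duplicate links at the database level; a duplicate insert then
-- raises MySQLdb.IntegrityError, which the pipeline catches above.
ALTER TABLE AnnonceGratuit ADD UNIQUE INDEX idx_link (link);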