Storing scraped data in a MySQL database with Scrapy

Asked: 2015-06-04 00:39:29

Tags: mysql scrapy store items

I'm new here, and this is my first time using Scrapy, so I really need help. I know this has been asked before, and I have tried many of the proposed solutions, but none of them worked.

My pipelines file:

import sys
import MySQLdb
import hashlib
from scrapy.exceptions import NotConfigured
from scrapy.exceptions import DropItem
from scrapy.http import Request
from projetpfe.items import ProjetpfeItem

class MySQLStorePipeline(object):
    def __init__(self):
        try:
            # open one connection for the lifetime of the pipeline
            self.conn = MySQLdb.connect(user='root', passwd='root123', host='localhost',
                                        db='pressebam', use_unicode=True, charset='utf8')
            self.cursor = self.conn.cursor()
            self.cursor.execute("CREATE TABLE IF NOT EXISTS scrapeddata2("
                                "idscrapedData INT NOT NULL AUTO_INCREMENT PRIMARY KEY, "
                                "nomOrganePresse VARCHAR(200), titreArticle VARCHAR(200), "
                                "url VARCHAR(200), nomJournaliste VARCHAR(200), "
                                "jour VARCHAR(100), annee VARCHAR(100), categorie VARCHAR(100), "
                                "contenuArticle VARCHAR(5000), lienImage VARCHAR(200))")
            self.conn.commit()
        except (AttributeError, MySQLdb.OperationalError), e:
            raise e

    def process_item(self, item, spider):
        try:
            self.cursor.execute(
                "INSERT INTO scrapeddata2 (nomOrganePresse, titreArticle, url, jour, "
                "contenuArticle, lienImage) VALUES (%s, %s, %s, %s, %s, %s)",
                (item['OrganePresse'],
                 item['Titre'],
                 item['URL'],
                 item['Jour'],
                 item['Contenu'],
                 item['LienImage']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        # note: the return must sit outside the except block, otherwise the
        # item is only passed on when the insert fails
        return item
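
As a quick sanity check, the pipeline can be exercised on its own, outside Scrapy, with a plain dict standing in for the item. This is only a debugging sketch: the file name and the test values are made up, and it assumes the MySQL credentials used above.

# test_pipeline.py -- hypothetical standalone check, run with: python test_pipeline.py
from projetpfe.pipelines import MySQLStorePipeline

pipeline = MySQLStorePipeline()
fake_item = {
    'OrganePresse': 'Telquel',
    'Titre': 'Test title',
    'URL': 'http://telquel.ma/test',
    'Jour': '2015-06-04',
    'Contenu': 'Test content',
    'LienImage': 'http://telquel.ma/test.jpg',
}
pipeline.process_item(fake_item, spider=None)
# if a row now appears in scrapeddata2, the SQL side works and the problem
# is in the data the spider puts into the item (see the note after the spider)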

And here is my spider file:

import urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from projetpfe.items import ProjetpfeItem

class ProjetpfeSpider(CrawlSpider):
    name = 'telquel'
    start_urls = ['http://telquel.ma'] # urls from which the spider will start crawling
    rules = [Rule(SgmlLinkExtractor(allow=[r'page/\d+']), follow=True),
        # r'page/\d+' : regular expression for http://telquel.ma/page/X URLs
        Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}/\d{2}/\w+']), callback='parse_telquel')]
        # r'\d{4}/\d{2}/\d{2}/\w+' : regular expression for http://telquel.ma/YYYY/MM/DD/title URLs

    def parse_telquel(self, response):
        hxs = HtmlXPathSelector(response)
        item = ProjetpfeItem()

        # XPath selectors for the item fields

        item['Titre'] = hxs.select("//h1[@class='article-title']/text()").extract()
        item['LienImage'] = hxs.select("//div[@class='main-article-content']//img[@class='setborder']/@src").extract()
        item['OrganePresse'] = hxs.select("//img[@class='logo']/@alt").extract()
        item['Jour'] = hxs.select("//div[@class='calendar-date']/text()").extract()
        item['Contenu'] = hxs.select("//div[@class='shortcode-content']").extract()
        item['URL'] = hxs.select("/html/head/link[5]/@href").extract()
        return item
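
One detail that matters here: with this Scrapy version, hxs.select(...).extract() returns a list of unicode strings, not a single string, so every field above ends up holding a Python list. Depending on how many nodes each XPath matches, MySQLdb can then fail to render the %s placeholders, and the print-only except block in the pipeline makes that failure easy to miss. Below is a sketch of the same callback with the fields flattened to plain strings; the first() helper and the join/strip choices are my own assumptions, not part of the original code.

    def parse_telquel(self, response):
        hxs = HtmlXPathSelector(response)
        item = ProjetpfeItem()

        def first(xpath):
            # extract() returns a list; keep the first match, or '' if none
            values = hxs.select(xpath).extract()
            return values[0].strip() if values else ''

        item['Titre'] = first("//h1[@class='article-title']/text()")
        item['LienImage'] = first("//div[@class='main-article-content']//img[@class='setborder']/@src")
        item['OrganePresse'] = first("//img[@class='logo']/@alt")
        item['Jour'] = first("//div[@class='calendar-date']/text()")
        # the article body can span several nodes, so join them into one string
        item['Contenu'] = u' '.join(hxs.select("//div[@class='shortcode-content']").extract())
        item['URL'] = first("/html/head/link[5]/@href")
        return item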

Here is the settings file:

BOT_NAME = 'projetpfe'

SPIDER_MODULES = ['projetpfe.spiders']
NEWSPIDER_MODULE = 'projetpfe.spiders'

ITEM_PIPELINES = {'projetpfe.pipelines.MySQLStorePipeline' : 300}
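
With ITEM_PIPELINES set like this, Scrapy lists the pipeline in its startup log; if it never shows up there, the pipeline class is not even being constructed. Something along these lines should appear when the crawl starts (the exact wording differs between Scrapy versions):

$ scrapy crawl telquel
...
[scrapy] INFO: Enabled item pipelines: MySQLStorePipeline
...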

And finally, my items file:

from scrapy.item import Item, Field


class ProjetpfeItem(Item):
    OrganePresse = Field()
    Titre = Field()
    Journaliste = Field()
    Jour = Field()
    Annee = Field()
    Categorie = Field()
    Contenu = Field()
    LienImage = Field()
    URL = Field()
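
Note that a Scrapy Item behaves like a dict, but reading a declared field that was never assigned raises KeyError rather than returning a default. The spider only fills six of these nine fields, so the pipeline must only read those six (which it does). A short interactive illustration, assuming the project is on the Python path:

>>> from projetpfe.items import ProjetpfeItem
>>> item = ProjetpfeItem()
>>> item['Titre'] = u'Some headline'
>>> item['Titre']
u'Some headline'
>>> item['Annee']        # declared above, but never set by the spider
Traceback (most recent call last):
    ...
KeyError: 'Annee'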

So the spider runs fine, but nothing gets stored in the database. HELP!!!

0 answers