scrapy将项目加载到mysql服务器

时间:2016-05-18 21:00:39

标签: python mysql scrapy

我想要将数据废弃到mysql中,但它不起作用。我不知道这个错误在哪里。我试过了,但它不起作用。 请首先加载schema()。 你能帮我吗?

Mysql代码

DROP TABLE IF EXISTS website;
CREATE TABLE website (
  guid CHAR(32) PRIMARY KEY,
  name TEXT,
  url TEXT,
  updated DATETIME
) DEFAULT CHARSET=utf8;

pipeline.py

# -*- coding: utf-8 -*-
from datetime import datetime
import MySQLdb
from hashlib import md5
from scrapy import log
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class FilterWordsPipeline(object):
    """A pipeline for filtering out items which contain certain words in their
    description"""
'''
    # put all words in lowercase
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            desc = item.get('description') or ''
            if word in desc.lower():
                raise DropItem("Contains forbidden word: %s" % word)
        else:
            return item

'''

class RequiredFieldsPipeline(object):
    """A pipeline to ensure the item have the required fields."""

    required_fields = ('name', 'url')

    def process_item(self, item, spider):
        for field in self.required_fields:
            if not item.get(field):
                raise DropItem("Field '%s' missing: %r" % (field, item))
        return item


class MySQLStorePipeline(object):
    """A pipeline to store the item in a MySQL database.

    This implementation uses Twisted's asynchronous database API.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        # run db query in the thread pool
        d = self.dbpool.runInteraction(self._do_upsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        # at the end return the item in case of success or failure
        d.addBoth(lambda _: item)
        # return the deferred instead the item. This makes the engine to
        # process next item (according to CONCURRENT_ITEMS setting) after this
        # operation (deferred) has finished.
        return d

    def _do_upsert(self, conn, item, spider):
        """Perform an insert or update."""
        guid = self._get_guid(item)
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')

        conn.execute("""SELECT EXISTS(
            SELECT 1 FROM website WHERE guid = %s
        )""", (guid, ))
        ret = conn.fetchone()[0]

        if ret:
            conn.execute("""
                UPDATE website
                SET name=%s, url=%s, updated=%s
                WHERE guid=%s
            """, (item['name'], item['url'], now, guid))
            spider.log("Item updated in db: %s %r" % (guid, item))
        else:
            conn.execute("""
                INSERT INTO website (guid, name, url, updated)
                VALUES (%s, %s, %s, %s)
            """, (guid, item['name'], item['url'], now))
            spider.log("Item stored in db: %s %r" % (guid, item))

    def _handle_error(self, failure, item, spider):
        """Handle occurred on db interaction."""
        # do nothing, just log
        log.err(failure)

    def _get_guid(self, item):
        """Generates an unique identifier for a given item."""
        # hash based solely in the url field
        return md5(item['url']).hexdigest()

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.contrib.loader.processor import TakeFirst

class GepomysqlItem(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
#class WebsiteLoader(XPathItemLoader):
#    default_item_class = Website
    #default_output_processor = TakeFirst()

gipo.py

# -*- coding: utf-8 -*-
import scrapy
import hashlib
from gepomysql.items import GepomysqlItem
from scrapy.loader import ItemLoader

class GepoSpider(scrapy.Spider):
    name = "gipo"
    allowed_domains = ["gepris.dfg.de/gepris/"]
    start_urls = ['http://gepris.dfg.de/gepris/OCTOPUS?beginOfFunding=&bewilligungsStatus=&context=projekt&continentId=%23&countryKey=%23%23%23&einrichtungsart=-1&fachlicheZuordnung=%23&findButton=historyCall&gefoerdertIn=&hitsPerPage=50&index=0&nurProjekteMitAB=false&oldContinentId=%23&oldCountryId=%23%23%23&oldSubContinentId=%23%23&oldpeo=%23&peo=%23&subContinentId=%23%23&task=doSearchExtended&teilprojekte=true&zk_transferprojekt=false',]

    def parse(self, response):
        for site in response.xpath("//*[@id='liste']/div[contains(@class,'eintrag')]"):
            il = ItemLoader(item=GepomysqlItem(), selector=site)
            il.add_xpath('url', './/div[@class="results"]/h2/a/text()')
            il.add_xpath('name', './/div[span="Sprecher"]/span[2]/text()')
            #il.add_xpath('description', 'text()', re='-\s([^\n]*?)\\n')
            yield il.load_item()

settings.py

# Scrapy settings for dirbot project

SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'

ITEM_PIPELINES = [
    'dirbot.pipelines.RequiredFieldsPipeline',
    'dirbot.pipelines.FilterWordsPipeline',
    'dirbot.pipelines.MySQLStorePipeline',
]

MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'dirbot'
MYSQL_USER = 'root'
MYSQL_PASSWD = ''

0 个答案:

没有答案