I want to scrape data into MySQL, but it isn't working and I can't find where the error is. The schema below is loaded first. Can you help me?
MySQL schema
DROP TABLE IF EXISTS website;
CREATE TABLE website (
    guid CHAR(32) PRIMARY KEY,
    name TEXT,
    url TEXT,
    updated DATETIME
) DEFAULT CHARSET=utf8;
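A quick standalone check (just a sketch, reusing the MYSQL_* values from settings.py below) can confirm that the schema actually loaded and the website table is reachable before the spider runs:

import MySQLdb

# Assumes the same connection values as in settings.py below.
conn = MySQLdb.connect(host='localhost', db='dirbot', user='root',
                       passwd='', charset='utf8', use_unicode=True)
cursor = conn.cursor()
cursor.execute("SHOW TABLES LIKE 'website'")
print(cursor.fetchone())  # expect ('website',) if the schema was loaded
conn.close()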
pipeline.py
# -*- coding: utf-8 -*-
from datetime import datetime
import MySQLdb
from hashlib import md5
from scrapy import log
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class FilterWordsPipeline(object):
    """A pipeline for filtering out items which contain certain words in their
    description"""
    '''
    # put all words in lowercase
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            desc = item.get('description') or ''
            if word in desc.lower():
                raise DropItem("Contains forbidden word: %s" % word)
        else:
            return item
    '''
class RequiredFieldsPipeline(object):
    """A pipeline to ensure the item has the required fields."""

    required_fields = ('name', 'url')

    def process_item(self, item, spider):
        for field in self.required_fields:
            if not item.get(field):
                raise DropItem("Field '%s' missing: %r" % (field, item))
        return item
class MySQLStorePipeline(object):
    """A pipeline to store the item in a MySQL database.

    This implementation uses Twisted's asynchronous database API.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)
    def process_item(self, item, spider):
        # run the db query in the thread pool
        d = self.dbpool.runInteraction(self._do_upsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        # at the end return the item in case of success or failure
        d.addBoth(lambda _: item)
        # return the deferred instead of the item. This makes the engine
        # process the next item (according to the CONCURRENT_ITEMS setting)
        # after this operation (deferred) has finished.
        return d
    def _do_upsert(self, conn, item, spider):
        """Perform an insert or update."""
        guid = self._get_guid(item)
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')

        conn.execute("""SELECT EXISTS(
            SELECT 1 FROM website WHERE guid = %s
        )""", (guid, ))
        ret = conn.fetchone()[0]

        if ret:
            conn.execute("""
                UPDATE website
                SET name=%s, url=%s, updated=%s
                WHERE guid=%s
            """, (item['name'], item['url'], now, guid))
            spider.log("Item updated in db: %s %r" % (guid, item))
        else:
            conn.execute("""
                INSERT INTO website (guid, name, url, updated)
                VALUES (%s, %s, %s, %s)
            """, (guid, item['name'], item['url'], now))
            spider.log("Item stored in db: %s %r" % (guid, item))
    def _handle_error(self, failure, item, spider):
        """Handle errors raised during the db interaction."""
        # do nothing, just log
        log.err(failure)

    def _get_guid(self, item):
        """Generates a unique identifier for a given item."""
        # hash based solely on the url field
        return md5(item['url']).hexdigest()
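One assumption worth flagging: the code above reads like Python 2 (MySQLdb, scrapy.log). If it ever runs under Python 3, hashlib.md5 rejects plain str, so _get_guid would need to encode the url first, roughly:

    def _get_guid(self, item):
        # hash based solely on the url field; .encode() is only needed on
        # Python 3, where md5() requires bytes rather than str
        return md5(item['url'].encode('utf-8')).hexdigest()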
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy.contrib.loader.processor import TakeFirst
class GepomysqlItem(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()

#class WebsiteLoader(XPathItemLoader):
#    default_item_class = Website
#    default_output_processor = TakeFirst()
gipo.py
# -*- coding: utf-8 -*-
import scrapy
import hashlib
from gepomysql.items import GepomysqlItem
from scrapy.loader import ItemLoader
class GepoSpider(scrapy.Spider):
    name = "gipo"
    allowed_domains = ["gepris.dfg.de/gepris/"]
    start_urls = ['http://gepris.dfg.de/gepris/OCTOPUS?beginOfFunding=&bewilligungsStatus=&context=projekt&continentId=%23&countryKey=%23%23%23&einrichtungsart=-1&fachlicheZuordnung=%23&findButton=historyCall&gefoerdertIn=&hitsPerPage=50&index=0&nurProjekteMitAB=false&oldContinentId=%23&oldCountryId=%23%23%23&oldSubContinentId=%23%23&oldpeo=%23&peo=%23&subContinentId=%23%23&task=doSearchExtended&teilprojekte=true&zk_transferprojekt=false',]

    def parse(self, response):
        for site in response.xpath("//*[@id='liste']/div[contains(@class,'eintrag')]"):
            il = ItemLoader(item=GepomysqlItem(), selector=site)
            il.add_xpath('url', './/div[@class="results"]/h2/a/text()')
            il.add_xpath('name', './/div[span="Sprecher"]/span[2]/text()')
            #il.add_xpath('description', 'text()', re='-\s([^\n]*?)\\n')
            yield il.load_item()
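If no items ever reach the pipelines, the XPath selectors are the first thing to rule out. A rough check (a sketch: open scrapy shell on the start URL above and paste these lines, where response is the downloaded page):

sites = response.xpath("//*[@id='liste']/div[contains(@class,'eintrag')]")
print(len(sites))  # 0 means parse() yields nothing, so the pipelines never run
for site in sites[:3]:
    print(site.xpath('.//div[@class="results"]/h2/a/text()').extract())
    print(site.xpath('.//div[span="Sprecher"]/span[2]/text()').extract())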
settings.py
# Scrapy settings for dirbot project
SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
ITEM_PIPELINES = [
    'dirbot.pipelines.RequiredFieldsPipeline',
    'dirbot.pipelines.FilterWordsPipeline',
    'dirbot.pipelines.MySQLStorePipeline',
]
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'dirbot'
MYSQL_USER = 'root'
MYSQL_PASSWD = ''
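For reference, recent Scrapy releases expect ITEM_PIPELINES to be a dict mapping each pipeline path to an order value rather than a plain list; a sketch of the equivalent setting, assuming the module paths stay as above:

ITEM_PIPELINES = {
    'dirbot.pipelines.RequiredFieldsPipeline': 100,
    'dirbot.pipelines.FilterWordsPipeline': 200,
    'dirbot.pipelines.MySQLStorePipeline': 300,
}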