使用Twisted将Scrapy输出保存到MySQL数据库?

时间:2016-12-21 23:54:37

标签: python python-2.7 python-3.x scrapy

我想将 the tutorial spider 的输出保存到 MySQL,但难以理解为什么我在项目的管道(pipeline)中收到以下错误:

ImportError: No module named MySQLdb
Exception AttributeError: "'QuotePipeline' object has no attribute 'dbpool'"

使用Ubuntu 16.04和Python 3.5.2。

如果有人能指出我哪里出错了,将不胜感激!

以下是相关代码:

/spiders/quotes.py

import scrapy


class QuotesSpider(scrapy.Spider):
    """Spider that collects quote text, author and tags from quotes.toscrape.com."""

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]

    def parse(self, response):
        """Yield one dict per quote block found on the page."""
        quote_blocks = response.css('div.quote')
        for block in quote_blocks:
            item = {
                'text': block.css('span.text::text').extract_first(),
                'author': block.css('span small::text').extract_first(),
                'tags': block.css('div.tags a.tag::text').extract(),
            }
            yield item

/items.py

import scrapy


class QuoteItem(scrapy.Item):
    """Container for a single scraped quote.

    The field names must match the column names of the `quotes` MySQL
    table, because the pipeline builds its INSERT column list directly
    from ``item.keys()``.
    """
    # (removed the redundant trailing `pass` — the class body already
    # contains statements, so it was dead code)
    text = scrapy.Field()    # full quote text
    author = scrapy.Field()  # author display name
    tags = scrapy.Field()    # list of tag strings

/pipelines.py

from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class QuotePipeline(object):
    """Insert scraped items into the `quotes` MySQL table asynchronously.

    Uses Twisted's adbapi connection pool so the INSERTs run on a thread
    pool and do not block the crawler's event loop.
    """

    # The table your items.QuoteItem class maps to; mine is named quotes.
    # The two %s slots are filled in insert_data() with the column list
    # and the matching value placeholders.
    insert_sql = """insert into quotes (%s) values ( %s )"""

    def __init__(self):
        # Assign dbpool FIRST: if pool creation below raises (e.g. the
        # driver module named by DB_SERVER is not installed), teardown
        # would otherwise hit the secondary
        # "'QuotePipeline' object has no attribute 'dbpool'" error.
        self.dbpool = None
        dbargs = settings.get('DB_CONNECT')
        db_server = settings.get('DB_SERVER')
        if not db_server or not dbargs:
            # Fail with a clear message instead of a TypeError from **None.
            raise ValueError('DB_SERVER and DB_CONNECT must be set in settings.py')
        self.dbpool = adbapi.ConnectionPool(db_server, **dbargs)

    def close_spider(self, spider):
        """Scrapy hook: release the connection pool when the spider closes."""
        if self.dbpool is not None:
            self.dbpool.close()
            self.dbpool = None

    def __del__(self):
        # Safety net for non-Scrapy teardown paths. getattr-guarded so a
        # failed __init__ never triggers an AttributeError here, and the
        # None check avoids double-closing after close_spider().
        pool = getattr(self, 'dbpool', None)
        if pool is not None:
            pool.close()

    def process_item(self, item, spider):
        """Queue an asynchronous INSERT for the item, then pass it along."""
        self.insert_data(item, self.insert_sql)
        return item

    def insert_data(self, item, insert):
        """Build and run a parameterized INSERT from the item's fields.

        Returns the Deferred from runOperation so callers could attach
        errbacks if desired.
        """
        keys = list(item.keys())
        fields = u','.join(keys)
        qm = u','.join([u'%s'] * len(keys))
        # Only column names are interpolated into the SQL text; the values
        # travel separately as query parameters (no SQL-injection risk
        # from scraped data).
        sql = insert % (fields, qm)
        data = [item[k] for k in keys]
        return self.dbpool.runOperation(sql, data)

/settings.py

# Scrapy project identity.
BOT_NAME = 'tutorial'

SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'

# Name of the DB-API driver module that twisted.enterprise.adbapi will
# import by name. NOTE(review): 'MySQLdb' has no Python 3 release, so on
# Python 3.5 this import fails (the ImportError in the question);
# install a Python 3 driver such as mysqlclient, or switch this value to
# 'pymysql' — confirm against your environment.
DB_SERVER = 'MySQLdb'
# Keyword arguments forwarded verbatim to the driver's connect() call.
DB_CONNECT = {
    'db': 'scrapy',
    'user': 'username',
    'passwd': 'password',
    'host': 'ip.of.the.server',
    'charset': 'utf8',
    'use_unicode': True,
}

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tutorial.pipelines.QuotePipeline': 500,
}

SQL架构

-- Schema for the pipeline's target table. Statement terminators and a
-- USE were missing: pasted as-is, the two CREATEs would run as one
-- malformed statement and the table would land in whatever database was
-- currently selected.
CREATE DATABASE `scrapy` /*!40100 DEFAULT CHARACTER SET utf8mb4 */;

USE `scrapy`;

CREATE TABLE `quotes` (
 `id` mediumint(6) NOT NULL AUTO_INCREMENT,
 `text` text NOT NULL,
 `author` varchar(255) NOT NULL,
 `tags` varchar(255) NOT NULL,
 PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

1 个答案:

答案 0 :(得分:1)

问题解决了:

sudo apt-get install python-mysqldb

(注:`python-mysqldb` 软件包只适用于 Python 2。提问者使用的是 Python 3.5.2,应改为安装 Python 3 的驱动,例如 `pip3 install mysqlclient`;或者安装 `pymysql` 并把 settings.py 中的 `DB_SERVER` 改为 `'pymysql'`。)