MySQL not saving the data being scraped

Asked: 2013-07-29 21:03:39

Tags: python web-scraping scrapy web-crawler scrapyd

I built a small project using Scrapy. The problem is that my spider crawls the pages and scrapes the data, but nothing gets saved to my database. I am using MySQL as my database.

I think something is missing from my pipelines.py file:

from scrapy import log
from twisted.enterprise import adbapi

import MySQLdb.cursors

# Pipeline that writes scraped items into MySQL via a Twisted
# connection pool.
class MySQLStorePipeline(object):

    def __init__(self, *args, **kwargs):
        # Database connection settings. Note that MySQLdb expects
        # the port as an integer, not a string.
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                db='project2',
                user='root',
                passwd='',
                host='127.0.0.1',
                port=3306,
                cursorclass=MySQLdb.cursors.DictCursor,
                charset='utf8',
                use_unicode=True
            )

    def process_item(self, item, spider):
        # Run the insert in a thread from the pool so the reactor
        # (and therefore the crawl) is not blocked by the query.
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        # Only insert when the item actually carries data. Checking an
        # XPath here cannot work: the pipeline only sees the item, not
        # the response, so `sites.get(...)` was an undefined name.
        if item.get('Catogory'):
            tx.execute(
                "insert into crawlerapp_directory (Catogory, Bussiness_name, Description, Number, Web_url) "
                "values (%s, %s, %s, %s, %s)",
                (item['Catogory'][0],
                 item['Bussiness_name'][0],
                 item['Description'][0],
                 item['Number'][0],
                 item['Web_url'][0],
                 )
                )
            # execute() returns an affected-row count, not the new
            # primary key; read the generated id from the cursor instead.
            insert_id = tx.lastrowid

            # Insert the row holding the foreign key to Adress.
            tx.execute(
                "insert into crawlerapp_adress (directory_id, adress_name) "
                "values (%s, %s)",
                (insert_id,
                 item['adress_name'][0]
                 )
                )
            # Insert the row holding the foreign key to Photos.
            tx.execute(
                "insert into crawlerapp_photos (directory_id, Photo_path, Photo_name) "
                "values (%s, %s, %s)",
                (insert_id,
                 item['Photo_path'][0],
                 item['Photo_name'][0]
                 )
                )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        # Must be a method of the pipeline class; in the original it was
        # accidentally nested inside _conditional_insert, so the
        # addErrback(self.handle_error) call above would have failed.
        log.err(e)
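
For context, the pipeline above expects an item carrying all the fields used in the inserts. A minimal items.py sketch, with the field names taken from the code above (the class name DirectoryItem is an assumption):

from scrapy.item import Item, Field

class DirectoryItem(Item):
    # Field names mirror the keys used in the pipeline above,
    # including the original spellings Catogory and Bussiness_name.
    Catogory = Field()
    Bussiness_name = Field()
    Description = Field()
    Number = Field()
    Web_url = Field()
    adress_name = Field()
    Photo_path = Field()
    Photo_name = Field()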

Please guide me on how to save the scraped data to my database.

1 answer:

Answer 0: (score: 0)

Try this, it works for me:

import MySQLdb
from scrapy import log


class MySpiderPipeline(object):
    def __init__(self):
        # Replace the placeholder names below with your own
        # connection details.
        self.conn = MySQLdb.connect(host_name,
                                    user_name,
                                    password,
                                    db_name,
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        try:
            # Parameterized query: the driver handles quoting, so no
            # manual escaping of quote characters is needed.
            sql = ("INSERT INTO tutorial (`title`, `link`, `desc`, `last_updated`) "
                   "VALUES (%s, %s, %s, %s)")
            self.cursor.execute(sql, (item['title'][0],
                                      item['link'][0],
                                      item['desc'][0],
                                      item['last_updated']))
            self.conn.commit()
        except MySQLdb.Error as e:
            print '!!!!!!!!!!!!!!!!!!DB Write failure!!!!!!!!!!!!'
            print "Error %d: %s" % (e.args[0], e.args[1])
            log.msg("Error %d: %s" % (e.args[0], e.args[1]), level=log.CRITICAL)
        return item
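
Whichever pipeline you use, note that Scrapy only calls it if it is enabled in settings.py; a pipeline that is never registered silently drops every item. A minimal sketch, assuming the project package is named project2 (adjust the dotted path to your own project):

# settings.py -- the dotted path is an assumption; use your own module names.
ITEM_PIPELINES = [
    'project2.pipelines.MySQLStorePipeline',
]
# Newer Scrapy versions expect a dict mapping the class path to an
# order number instead:
# ITEM_PIPELINES = {'project2.pipelines.MySQLStorePipeline': 300}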