scrapy mysql管道错误

时间:2012-05-03 14:23:29

标签: python mysql scrapy

我正在使用 Scrapy,想把爬虫(spider)抓取到的数据保存到 MySQL 数据库。我尝试通过管道(pipeline)来实现,但一直没有成功。下面是我的管道代码:

from scrapy import log
from scrapy.core.exceptions import DropItem
from twisted.enterprise import adbapi

import time
import MySQLdb.cursors

class FilterWordsPipeline(object):
"""A pipeline for filtering out items which contain certain words in their
description"""

# put all words in lowercase
words_to_filter = ['politics', 'religion']

def process_item(self, spider, item):
    print spider
    for word in self.words_to_filter:
        if word in unicode(item['description']).lower():
            raise DropItem("Contains forbidden word: %s" % word)
    else:
        return item

class MySQLStorePipeline(object):
# Item pipeline that writes scraped items into the `scrapytest` MySQL table
# through a Twisted adbapi connection pool.
# NOTE(review): the class body lost its indentation when it was pasted; the
# defs below are meant to be methods of this class.

def __init__(self):
    # Create the connection pool that process_item submits its work to.
    # @@@ hardcoded db settings
    # TODO: make settings configurable through settings
    # NOTE(review): adbapi.ConnectionPool takes the DB-API module name
    # (e.g. 'MySQLdb') as its first argument; 'adress_to_db' looks like a
    # host placeholder -- confirm, the host normally goes in host=...
    self.dbpool = adbapi.ConnectionPool('adress_to_db',
            db='my_db',
            user='my_user',
            passwd='my_pw',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True
        )

def process_item(self, spider, item):
    # Schedule the insert on the pool and hand the item straight back; any
    # failure of the interaction is routed to handle_error.
    # NOTE(review): Scrapy documents this hook as
    # process_item(self, item, spider) -- confirm the argument order.
    # run db query in thread pool
    query = self.dbpool.runInteraction(self._conditional_insert, item)
    query.addErrback(self.handle_error)

    return item

def _conditional_insert(self, tx, item):
    # Insert the item unless a row with the same link already exists.
    # `tx` is the transaction object supplied by runInteraction.
    # This whole method runs in its own thread.
    tx.execute("select * from scrapytest where link = %s", (item['link'][0], ))
    result = tx.fetchone()
    if result:
        log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
    else:
        # NOTE(review): the lone ")" below closes only the parameter tuple
        # opened at "(item['title'][0],", so the tx.execute( call itself is
        # never closed. This is the SyntaxError reported on the following
        # log.msg line.
        # NOTE(review): `desc` is a reserved word in MySQL and would need
        # backtick quoting in the column list.
        tx.execute(\
            "insert into scrapytest (title, link, desc) "
            "values (%s, %s, %s)",
            (item['title'][0],
             item['link'][0],
             item['desc'][0]
        )
        log.msg("Item stored in db: %s" % item, level=log.DEBUG)

def handle_error(self, e):
    # Errback for the database interaction: log the failure.
    log.err(e) 

这是我收到的错误消息:

SyntaxError: invalid syntax
PS C:\Python27\testscrapy\tutorial> scrapy crawl dmoz
2012-05-03 16:03:11+0200 [scrapy] INFO: Scrapy 0.14.3 started (bot: tutorial)
2012-05-03 16:03:12+0200 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole,                        
CloseSpider, WebService, CoreStats
 , SpiderState
2012-05-03 16:03:12+0200 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware,                           
DownloadTimeoutMiddleware,
UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, RedirectMiddleware,     
CookiesMiddleware, HttpCompressionMi
ddleware, ChunkedTransferMiddleware, DownloaderStats
2012-05-03 16:03:12+0200 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware,        
OffsiteMiddleware, RefererMidd
leware, UrlLengthMiddleware, DepthMiddleware
Traceback (most recent call last):
File "C:\Python27\Scripts\scrapy", line 5, in <module>
pkg_resources.run_script('Scrapy==0.14.3', 'scrapy')
File "C:\Python27\lib\site-packages\pkg_resources.py", line 489, in run_script
self.require(requires)[0].run_script(script_name, ns)
File "C:\Python27\lib\site-packages\pkg_resources.py", line 1207, in run_script
execfile(script_filename, namespace, namespace)
File "c:\python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\EGG-INFO\scripts\scrapy", line   
4, in <module>
execute()
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\cmdline.py", line 132,   
in execute
run_print_help(parser, _run_command, cmd, args, opts)
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\cmdline.py", line 97, in   
_run_print_help
func(*a, **kw)
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\cmdline.py", line 139,   
in _run_command
cmd.run(args, opts)
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\commands\crawl.py", line   
43, in run
spider = self.crawler.spiders.create(spname, **opts.spargs)
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\command.py", line 34,   
in crawler
self._crawler.configure()
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\crawler.py", line 37, in   
configure
self.engine = ExecutionEngine(self, self._spider_closed)
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\core\engine.py", line   
62, in __init__
self.scraper = Scraper(crawler)
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\core\scraper.py", line   
68, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\middleware.py", line 48,   
in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\middleware.py", line 29,   
in from_settings
mwcls = load_object(clspath)
File "C:\Python27\lib\site-packages\scrapy-0.14.3-py2.7-win32.egg\scrapy\utils\misc.py", line 37,  
in load_object
mod = __import__(module, {}, {}, [''])
File "C:\Python27\testscrapy\tutorial\tutorial\pipelines.py", line 64
log.msg("Item stored in db: %s" % item, level=log.DEBUG)
  ^
SyntaxError: invalid syntax

我不知道从哪里开始,所以任何帮助都非常感谢!

1 个答案:

答案 0 :(得分:1)

# The parameter tuple must be closed with its own ")" before the ")" that
# ends the tx.execute(...) call. No backslash is needed: lines inside
# parentheses continue implicitly.
# `desc` is a reserved word in MySQL, so the column name is backtick-quoted;
# left bare, the INSERT fails with a SQL syntax error.
tx.execute(
    "insert into scrapytest (title, link, `desc`) "
    "values (%s, %s, %s)",
    (item['title'][0],
     item['link'][0],
     item['desc'][0]),
)

参数元组缺少右括号,需要把它补上 ^^

排查这类错误时,一个好的起点通常是错误所指向的那一行,或者它的前一行。