So, I have this problem that is driving me crazy: I am trying to store scraped items into MySQL through a pipeline, but I cannot get it to work.
If I store just one column it works, but the moment I add a second one I get this strange error:
Error 1064: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '), 1)' at line 2
So I get the error above; my code in pipelines.py is:
import MySQLdb

class DropToDb(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host="localhost", user="root", passwd="root", db="Test")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""
                INSERT INTO Main (url, domain_id)
                VALUES (%s, %s)
            """, (item['url'], item['domain_id']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
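
A debugging aside (my addition, not from the original post): when a parameterized INSERT fails with a 1064, it helps to look at the exact SQL string the driver interpolated. MySQLdb keeps it in `_last_executed`, a private, driver-specific attribute that may not exist in other drivers or versions, so treat this sketch as an assumption-laden diagnostic, not an API guarantee:

import MySQLdb

conn = MySQLdb.connect(host="localhost", user="root", passwd="root", db="Test")
cursor = conn.cursor()
try:
    # Deliberately malformed: the second parameter is a tuple, not a scalar,
    # mimicking what a stray trailing comma produces.
    cursor.execute("INSERT INTO Main (url, domain_id) VALUES (%s, %s)",
                   ("http://example.com", (1,)))
except MySQLdb.Error, e:
    # _last_executed is private and driver-specific, hence the getattr fallback.
    print "Query sent:", getattr(cursor, "_last_executed", "<unavailable>")
    print "Error %d: %s" % (e.args[0], e.args[1])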
If I drop one of the columns and item fields, then it works fine, as below:
class DropToDb(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host="localhost", user="root", passwd="root", db="Test")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""
                INSERT INTO Main (url)
                VALUES (%s)
            """, (item['url']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
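
One side note (my addition, not part of the original question): `(item['url'])` is just a parenthesized string, not a tuple. MySQLdb happens to accept a bare string as the parameter argument, which is why this version works, but the unambiguous DB-API form is a one-element tuple:

            # Note the trailing comma: this passes a one-element tuple,
            # the shape cursor.execute() expects for its parameters.
            self.cursor.execute("""
                INSERT INTO Main (url)
                VALUES (%s)
            """, (item['url'],))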
My Scrapy file looks like this:
if datematch:
    item['link_title'] = ogtitle
    item['link_description'] = response.xpath('//meta[@property="og:description"]/@content').extract()
    item['link_locale'] = response.xpath('//meta[@property="og:locale"]/@content').extract(),
    yield item
There are more fields than this, but I just wanted to show a few examples.
Can somebody help me get rid of this?
My spider file:
import scrapy
import MySQLdb
from MySQLdb.cursors import SSCursor
from scrapy.http import Request
import re
from Maintoo.items import MaintooSpider2Item
from scrapy.exceptions import DropItem
import datetime

class Maintoospider2Spider(scrapy.Spider):
    name = "MaintooSpider2"
    #start_urls = readdomainsfromdb()

    def start_requests(self):
        for domain_id, url, id_sitemap_links in readdomainsfromdb():
            yield Request(
                url,
                callback=self.parse,
                meta={
                    'domain_id': domain_id,
                    'id_sitemap_links': id_sitemap_links
                },
                errback=self.error
            )

    def error(self, failure):  # errbacks are called with a Failure object
        pass
    def parse(self, response):
        domainid = response.meta['domain_id']
        id_sitemap_links = response.meta['id_sitemap_links']
        #updateid(id_sitemap_links)
        ogtitle = response.xpath('//meta[@property="og:title"]/@content').extract()
        isporn = response.xpath('//meta[@content="RTA-5042-1996-1400-1577-RTA"]').extract()
        datematch = re.findall(r'(content="2015|2016")', response.body, re.IGNORECASE | re.DOTALL)
        item = MaintooSpider2Item()
        if '/tag/' in response.url:
            raise DropItem
        if isporn:
            updateporn(domainid)
            raise DropItem
        if datematch:
            item['link_title'] = ogtitle
            item['link_description'] = response.xpath('//meta[@property="og:description"]/@content').extract()
            item['link_locale'] = response.xpath('//meta[@property="og:locale"]/@content').extract()
            item['link_type'] = response.xpath('//meta[@property="og:type"]/@content').extract()
            item['link_url'] = response.xpath('//meta[@property="og:url"]/@content').extract()
            item['link_site_name'] = response.xpath('//meta[@property="og:site_name"]/@content').extract()
            item['link_article_tag'] = response.xpath('//meta[@property="article:tag"]/@content').extract()
            item['link_article_section'] = response.xpath('//meta[@property="article:section"]/@content').extract()
            item['link_article_published_time'] = response.xpath('//meta[@property="article:published_time"]/@content').extract()
            item['link_meta_keywords'] = response.xpath('//meta[@name="keywords"]/@content').extract()
            item['link_publisher'] = response.xpath('//meta[@property="article:publisher"]/@content').extract()
            item['link_article_author'] = response.xpath('//meta[@property="article:author"]/@content').extract()
            item['link_twitter_card'] = response.xpath('//meta[@name="twitter:card"]/@content').extract()
            item['link_twitter_description'] = response.xpath('//meta[@name="twitter:description"]/@content').extract()
            item['link_twitter_title'] = response.xpath('//meta[@name="twitter:title"]/@content').extract()
            item['link_twitter_image'] = response.xpath('//meta[@name="twitter:image"]/@content').extract()
            item['link_facebook_app_id'] = response.xpath('//meta[@property="fb:app_id"]/@content').extract()
            item['link_facebook_page_admins'] = response.xpath('//meta[@property="fb:admins"]/@content').extract()
            item['link_rss'] = response.xpath('//meta[@rel="alternate"]/@href').extract()
            item['link_twitter_image_source'] = response.xpath('//meta[@name="twitter:image:src"]/@content').extract()
            item['link_twitter_site'] = response.xpath('//meta[@name="twitter:site"]/@content').extract()
            item['link_twitter_url'] = response.xpath('//meta[@name="twitter:url"]/@content').extract()
            item['link_twitter_creator'] = response.xpath('//meta[@name="twitter:creator"]/@content').extract()
            item['link_apple_app'] = response.xpath('//meta[@name="apple-itunes-app"]/@content').extract()
            item['link_facebook_video'] = response.xpath('//meta[@property="og:video"]/@content').extract()
            item['link_facebook_page_id'] = response.xpath('//meta[@name="fb:page_id"]/@content').extract()
            item['link_id'] = response.xpath('//link[@rel="publisher"]/@href').extract()
            item['link_image'] = response.xpath('//meta[@property="og:image"]/@content').extract()
            item['url'] = response.url
            item['domain_id'] = domainid
            item['crawled_date'] = datetime.datetime.now().isoformat()
            yield item
My new pipelines file:
import MySQLdb
from scrapy.exceptions import DropItem

class dropifdescription(object):
    def process_item(self, item, spider):
        # to test whether only "job_id" is empty, change to:
        # if not item["job_id"]:
        if not item["link_title"]:
            raise DropItem()
        else:
            return item

class DropToDb(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host="localhost", user="root", passwd="root", db="Maintoo", charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""
                INSERT INTO Main (url, domain_id, link_title) VALUES (%s, %s, %s)""", (item['url'], item['domain_id'], item['link_title']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
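
A small aside (my addition): DropItem accepts an optional message that Scrapy includes when it logs the dropped item, which makes filtered items easier to trace:

            raise DropItem("missing link_title: %s" % item['url'])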
My settings file:
ITEM_PIPELINES = {
    'Maintoo.pipelines.dropifdescription': 200,
    'Maintoo.pipelines.DropToDb': 300,
}
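
A note on the numbers (my addition): Scrapy runs item pipelines in ascending order of these values, so dropifdescription (200) filters out items with an empty link_title before DropToDb (300) ever tries to insert them.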
Answer 0 (score: 3)
The problem comes from inside your spider:

    item['link_locale'] = response.xpath('//meta[@property="og:locale"]/@content').extract(),

See the trailing comma at the very end of that line: it makes your item['link_locale'] a tuple, which ends up breaking your SQL query. Remove the comma.
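
A minimal illustration of the pitfall (my sketch, plain Python, no Scrapy needed):

    values = ['en_US']    # what extract() would return
    field = values,       # trailing comma wraps it: field == (['en_US'],)
    field_ok = values     # no comma: field_ok == ['en_US']
    print type(field)     # <type 'tuple'>
    print type(field_ok)  # <type 'list'>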
Aside from that, you should use extract_first() instead of the regular extract() to get a single value rather than a list.
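
For example (assuming Scrapy 1.0 or later, where selector lists provide extract_first()):

    # extract() always returns a list, even for a single match:
    response.xpath('//meta[@property="og:locale"]/@content').extract()
    # -> ['en_US']

    # extract_first() returns the first match as a plain string, or None:
    response.xpath('//meta[@property="og:locale"]/@content').extract_first()
    # -> 'en_US'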