So, I have this problem that is driving me crazy: I am trying to store scraped items into MySQL through a pipeline, but I cannot get it to work.
If I store just one column it works, but the moment I add a second one I get this strange error:
Error 1064: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '), 1)' at line 2
So I get the error above; my code in pipelines.py is:
import MySQLdb

class DropToDb(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host="localhost", user="root", passwd="root", db="Test")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""
                INSERT INTO Main (url, domain_id)
                VALUES (%s, %s)
            """, (item['url'], item['domain_id']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
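
A debugging aside (my addition, not from the original post): when a parameterized INSERT fails with a 1064, it helps to look at the exact SQL string the driver interpolated. MySQLdb keeps it in `_last_executed`, a private, driver-specific attribute that may not exist in other drivers or versions, so treat this sketch as an assumption-laden diagnostic, not an API guarantee:

import MySQLdb

conn = MySQLdb.connect(host="localhost", user="root", passwd="root", db="Test")
cursor = conn.cursor()
try:
    # Deliberately malformed: the second parameter is a tuple, not a scalar,
    # mimicking what a stray trailing comma produces.
    cursor.execute("INSERT INTO Main (url, domain_id) VALUES (%s, %s)",
                   ("http://example.com", (1,)))
except MySQLdb.Error, e:
    # _last_executed is private and driver-specific, hence the getattr fallback.
    print "Query sent:", getattr(cursor, "_last_executed", "<unavailable>")
    print "Error %d: %s" % (e.args[0], e.args[1])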
If I drop one of the columns and item fields, then it works fine, as below:
class DropToDb(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host="localhost", user="root", passwd="root", db="Test")
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""
                INSERT INTO Main (url)
                VALUES (%s)
            """, (item['url']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
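
One side note (my addition, not part of the original question): `(item['url'])` is just a parenthesized string, not a tuple. MySQLdb happens to accept a bare string as the parameter argument, which is why this version works, but the unambiguous DB-API form is a one-element tuple:

            # Note the trailing comma: this passes a one-element tuple,
            # the shape cursor.execute() expects for its parameters.
            self.cursor.execute("""
                INSERT INTO Main (url)
                VALUES (%s)
            """, (item['url'],))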
My Scrapy file looks like this:
if datematch:
    item['link_title'] = ogtitle
    item['link_description'] = response.xpath('//meta[@property="og:description"]/@content').extract()
    item['link_locale'] = response.xpath('//meta[@property="og:locale"]/@content').extract(),
    yield item
There are more fields than this, but I just wanted to show a few examples.
Can somebody help me get rid of this?
My spider file:
import scrapy
import MySQLdb
from MySQLdb.cursors import SSCursor
from scrapy.http import Request
import re
from Maintoo.items import MaintooSpider2Item
from scrapy.exceptions import DropItem
import datetime

class Maintoospider2Spider(scrapy.Spider):
    name = "MaintooSpider2"
    #start_urls = readdomainsfromdb()

    def start_requests(self):
        for domain_id, url, id_sitemap_links in readdomainsfromdb():
            yield Request(
                url,
                callback=self.parse,
                meta={
                    'domain_id': domain_id,
                    'id_sitemap_links': id_sitemap_links
                },
                errback=self.error
            )

    def error(self, failure):  # errbacks are called with a Failure object
        pass
    def parse(self, response):
        domainid = response.meta['domain_id']
        id_sitemap_links = response.meta['id_sitemap_links']
        #updateid(id_sitemap_links)
        ogtitle = response.xpath('//meta[@property="og:title"]/@content').extract()
        isporn = response.xpath('//meta[@content="RTA-5042-1996-1400-1577-RTA"]').extract()
        datematch = re.findall(r'(content="2015|2016")', response.body, re.IGNORECASE | re.DOTALL)
        item = MaintooSpider2Item()
        if '/tag/' in response.url:
            raise DropItem
        if isporn:
            updateporn(domainid)
            raise DropItem
        if datematch:
            item['link_title'] = ogtitle
            item['link_description'] = response.xpath('//meta[@property="og:description"]/@content').extract()
            item['link_locale'] = response.xpath('//meta[@property="og:locale"]/@content').extract()
            item['link_type'] = response.xpath('//meta[@property="og:type"]/@content').extract()
            item['link_url'] = response.xpath('//meta[@property="og:url"]/@content').extract()
            item['link_site_name'] = response.xpath('//meta[@property="og:site_name"]/@content').extract()
            item['link_article_tag'] = response.xpath('//meta[@property="article:tag"]/@content').extract()
            item['link_article_section'] = response.xpath('//meta[@property="article:section"]/@content').extract()
            item['link_article_published_time'] = response.xpath('//meta[@property="article:published_time"]/@content').extract()
            item['link_meta_keywords'] = response.xpath('//meta[@name="keywords"]/@content').extract()
            item['link_publisher'] = response.xpath('//meta[@property="article:publisher"]/@content').extract()
            item['link_article_author'] = response.xpath('//meta[@property="article:author"]/@content').extract()
            item['link_twitter_card'] = response.xpath('//meta[@name="twitter:card"]/@content').extract()
            item['link_twitter_description'] = response.xpath('//meta[@name="twitter:description"]/@content').extract()
            item['link_twitter_title'] = response.xpath('//meta[@name="twitter:title"]/@content').extract()
            item['link_twitter_image'] = response.xpath('//meta[@name="twitter:image"]/@content').extract()
            item['link_facebook_app_id'] = response.xpath('//meta[@property="fb:app_id"]/@content').extract()
            item['link_facebook_page_admins'] = response.xpath('//meta[@property="fb:admins"]/@content').extract()
            item['link_rss'] = response.xpath('//meta[@rel="alternate"]/@href').extract()
            item['link_twitter_image_source'] = response.xpath('//meta[@name="twitter:image:src"]/@content').extract()
            item['link_twitter_site'] = response.xpath('//meta[@name="twitter:site"]/@content').extract()
            item['link_twitter_url'] = response.xpath('//meta[@name="twitter:url"]/@content').extract()
            item['link_twitter_creator'] = response.xpath('//meta[@name="twitter:creator"]/@content').extract()
            item['link_apple_app'] = response.xpath('//meta[@name="apple-itunes-app"]/@content').extract()
            item['link_facebook_video'] = response.xpath('//meta[@property="og:video"]/@content').extract()
            item['link_facebook_page_id'] = response.xpath('//meta[@name="fb:page_id"]/@content').extract()
            item['link_id'] = response.xpath('//link[@rel="publisher"]/@href').extract()
            item['link_image'] = response.xpath('//meta[@property="og:image"]/@content').extract()
            item['url'] = response.url
            item['domain_id'] = domainid
            item['crawled_date'] = datetime.datetime.now().isoformat()
            yield item
My new pipelines file:
import MySQLdb
from scrapy.exceptions import DropItem

class dropifdescription(object):
    def process_item(self, item, spider):
        # to test whether only "job_id" is empty, change to:
        # if not item["job_id"]:
        if not item["link_title"]:
            raise DropItem()
        else:
            return item

class DropToDb(object):
    def __init__(self):
        self.conn = MySQLdb.connect(host="localhost", user="root", passwd="root", db="Maintoo", charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute("""
                INSERT INTO Main (url, domain_id, link_title) VALUES (%s, %s, %s)""", (item['url'], item['domain_id'], item['link_title']))
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
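
A small aside (my addition): DropItem accepts an optional message that Scrapy includes when it logs the dropped item, which makes filtered items easier to trace:

            raise DropItem("missing link_title: %s" % item['url'])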
My settings file:
ITEM_PIPELINES = {
    'Maintoo.pipelines.dropifdescription': 200,
    'Maintoo.pipelines.DropToDb': 300,
}
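
A note on the numbers (my addition): Scrapy runs item pipelines in ascending order of these values, so dropifdescription (200) filters out items with an empty link_title before DropToDb (300) ever tries to insert them.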
Answer 0 (score: 3)
The problem comes from inside your spider:

    item['link_locale'] = response.xpath('//meta[@property="og:locale"]/@content').extract(),

See the trailing comma at the very end of that line: it makes your item['link_locale'] a tuple, which ends up breaking your SQL query. Remove the comma.
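
A minimal illustration of the pitfall (my sketch, plain Python, no Scrapy needed):

    values = ['en_US']    # what extract() would return
    field = values,       # trailing comma wraps it: field == (['en_US'],)
    field_ok = values     # no comma: field_ok == ['en_US']
    print type(field)     # <type 'tuple'>
    print type(field_ok)  # <type 'list'>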
Aside from that, you should use extract_first() instead of the regular extract() to get a single value rather than a list.
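
For example (assuming Scrapy 1.0 or later, where selector lists provide extract_first()):

    # extract() always returns a list, even for a single match:
    response.xpath('//meta[@property="og:locale"]/@content').extract()
    # -> ['en_US']

    # extract_first() returns the first match as a plain string, or None:
    response.xpath('//meta[@property="og:locale"]/@content').extract_first()
    # -> 'en_US'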