我已经制作了一个scrapy项目,其中包含一个文件中的多个蜘蛛,并且需要解释器能够区分哪个管道与哪个蜘蛛相关联,类似于询问此SO question的人。使用最高投票答案提供的解决方案,我将装饰器放在管道类中,然后在蜘蛛本身内定义管道列表。但是当我运行它时,我得到一个名称错误,因为管道是在蜘蛛文件中定义的。
由于pipelines.py文件不是模块,因此您无法将其真正导入到spiders.py文件中。我不确定所发布的答案是否仍然是相关的,因为它不是最近的,但似乎它确实适用于某人,所以至少值得一试。顺便说一句,我按照docs中提供的代码顺序运行了两个蜘蛛,即使我使用命令时两个蜘蛛都运行了:scrapy runspider,我认为管道类没有被调用。但是,当我单独运行每个蜘蛛时,表格会正确填充。我还在settings.py dicionary中包含了两个管道类。所以,我有几个问题:
[1。] 我是否根据问题中提供的答案正确设置了两个文件?
[2。] 如果是这样,我将如何正确连接这两个文件的命名空间?
[3。] 除了创建单独的项目之外,还有更好的方法吗?
我有以下两个文件的代码,任何帮助将不胜感激,谢谢。
pipelines.py
from sqlalchemy.orm import sessionmaker
from models import Tickets, Tickets3, db_connect, create_vs_tickets_table, create_tc_tickets_table
class ComparatorPipeline(object):
"""Price comparison pipeline for storing scraped items in the database"""
def __init__(self):
"""
Initializes database connection and sessionmaker.
Creates deals table.
"""
engine = db_connect()
create_vs_tickets_table(engine)
self.Session = sessionmaker(bind=engine)
def process_item(self, item, spider):
"""Save tickets in the database.
This method is called for every item pipeline component.
"""
def check_spider_pipeline(process_item_method):
@functools.wraps(process_item_method)
def wrapper(self, item, spider):
#message template for debugging
msg = '%%s %s pipeline step' % (self.__.class__.__name__,)
#if class is in the spider's pipeline, then use the
#process_item method normally.
if self.__class__ in spider.pipeline:
spider.log(msg % 'executing', level=log.DEBUG)
return process_item_method(self, item, spider)
#otherwise, just return the untouched item (skip this step in the pipeline)
else:
spider.log(msg % 'skipping', level= log.DEBUG)
return item
return wrapper
if spider.name == "comparator":
session = self.Session()
ticket = Tickets(**item)
try:
session.add(ticket)
session.commit()
except:
session.rollback()
raise
finally:
session.close()
return item
class ComparatorPipeline2(object):
"""Price comparison pipeline for storing scraped items in the database"""
def __init__(self):
"""
Initializes database connection and sessionmaker.
Creates deals table.
"""
engine = db_connect()
create_tc_tickets_table(engine)
self.Session = sessionmaker(bind=engine)
def process_item(self, item, spider):
"""Save tickets in the database.
This method is called for every item pipeline component.
"""
def check_spider_pipeline(process_item_method):
@functools.wraps(process_item_method)
def wrapper(self, item, spider):
#message template for debugging
msg = '%%s %s pipeline step' % (self.__.class__.__name__,)
#if class is in the spider's pipeline, then use the
#process_item method normally.
if self.__class__ in spider.pipeline:
spider.log(msg % 'executing', level=log.DEBUG)
return process_item_method(self, item, spider)
#otherwise, just return the untouched item (skip this step in the pipeline)
else:
spider.log(msg % 'skipping', level= log.DEBUG)
return item
return wrapper
if spider.name == "comparator2":
session = self.Session()
ticket2 = Tickets2(**item)
try:
session.add(ticket2)
session.commit()
except:
session.rollback()
raise
finally:
session.close()
return item
蜘蛛类定义
import scrapy
import re
import json
from scrapy.crawler import CrawlerProcess
from scrapy import Request
from scrapy.contrib.spiders import CrawlSpider , Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from concert_comparator.items import ComparatorItem, ComparatorItem3
from urlparse import urljoin
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging
bandname = raw_input("Enter a bandname \n")
vs_url = "http://www.vividseats.com/concerts/" + bandname + "-tickets.html"
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
class MySpider(CrawlSpider):
pipeline = set([
ComparatorPipeline
])
pipeline = ['first']
handle_httpstatus_list = [416]
name = 'comparator'
allowed_domains = ["www.vividseats.com"]
start_urls = [vs_url]
tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'
def parse_json(self, response):
loader = response.meta['loader']
jsonresponse = json.loads(response.body_as_unicode())
ticket_info = jsonresponse.get('tickets')
price_list = [i.get('p') for i in ticket_info]
if len(price_list) > 0:
str_Price = str(price_list[0])
ticketPrice = unicode(str_Price, "utf-8")
loader.add_value('ticketPrice', ticketPrice)
else:
ticketPrice = unicode("sold out", "utf-8")
loader.add_value('ticketPrice', ticketPrice)
return loader.load_item()
def parse_price(self, response):
loader = response.meta['loader']
ticketsLink = loader.get_output_value("ticketsLink")
json_id_list= re.findall(r"(\d+)[^-]*$", ticketsLink)
json_id= "".join(json_id_list)
json_url = "http://www.vividseats.com/javascript/tickets.shtml?productionId=" + json_id
yield scrapy.Request(json_url, meta={'loader': loader}, callback = self.parse_json, dont_filter = True)
def parse(self, response):
"""
# """
selector = HtmlXPathSelector(response)
# iterate over tickets
for ticket in selector.select(self.tickets_list_xpath):
loader = XPathItemLoader(ComparatorItem(), selector=ticket)
# define loader
loader.default_input_processor = MapCompose(unicode.strip)
loader.default_output_processor = Join()
# iterate over fields and add xpaths to the loader
loader.add_xpath('eventName' , './/*[@class="productionsEvent"]/text()')
loader.add_xpath('eventLocation' , './/*[@class = "productionsVenue"]/span[@itemprop = "name"]/text()')
loader.add_xpath('ticketsLink' , './/*/a[@class = "btn btn-primary"]/@href')
loader.add_xpath('eventDate' , './/*[@class = "productionsDate"]/text()')
loader.add_xpath('eventCity' , './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressLocality"]/text()')
loader.add_xpath('eventState' , './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressRegion"]/text()')
loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')
print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
#sel.xpath("//span[@id='PractitionerDetails1_Label4']/text()").extract()
ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketsLink")
ticketsURL = urljoin(response.url, ticketsURL)
yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)
class MySpider3(CrawlSpider):
pipeline = set([
ComparatorPipeline2
])
handle_httpstatus_list = [416]
name = 'comparator3'
allowed_domains = ["www.ticketcity.com"]
start_urls = [tc_url]
tickets_list_xpath = './/div[@class = "vevent"]'
def parse_json(self, response):
loader = response.meta['loader']
jsonresponse = json.loads(response.body_as_unicode())
ticket_info = jsonresponse.get('B')
price_list = [i.get('P') for i in ticket_info]
if len(price_list) > 0:
str_Price = str(price_list[0])
ticketPrice = unicode(str_Price, "utf-8")
loader.add_value('ticketPrice', ticketPrice)
else:
ticketPrice = unicode("sold out", "utf-8")
loader.add_value('ticketPrice', ticketPrice)
return loader.load_item()
def parse_price(self, response):
print "parse price function entered \n"
loader = response.meta['loader']
event_City = response.xpath('.//span[@itemprop="addressLocality"]/text()').extract()
eventCity = ''.join(event_City)
loader.add_value('eventCity' , eventCity)
event_State = response.xpath('.//span[@itemprop="addressRegion"]/text()').extract()
eventState = ''.join(event_State)
loader.add_value('eventState' , eventState)
event_Date = response.xpath('.//span[@class="event_datetime"]/text()').extract()
eventDate = ''.join(event_Date)
loader.add_value('eventDate' , eventDate)
ticketsLink = loader.get_output_value("ticketsLink")
json_id_list= re.findall(r"(\d+)[^-]*$", ticketsLink)
json_id= "".join(json_id_list)
json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
yield scrapy.Request(json_url, meta={'loader': loader}, callback = self.parse_json, dont_filter = True)
def parse(self, response):
"""
# """
selector = HtmlXPathSelector(response)
# iterate over tickets
for ticket in selector.select(self.tickets_list_xpath):
loader = XPathItemLoader(ComparatorItem(), selector=ticket)
# define loader
loader.default_input_processor = MapCompose(unicode.strip)
loader.default_output_processor = Join()
# iterate over fields and add xpaths to the loader
loader.add_xpath('eventName' , './/span[@class="summary listingEventName"]/text()')
loader.add_xpath('eventLocation' , './/div[@class="divVenue location"]/text()')
loader.add_xpath('ticketsLink' , './/a[@class="divEventDetails url"]/@href')
#loader.add_xpath('eventDateTime' , '//div[@id="divEventDate"]/@title') #datetime type
#loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')
print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
#sel.xpath("//span[@id='PractitionerDetails1_Label4']/text()").extract()
ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
ticketsURL = urljoin(response.url, ticketsURL)
yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)
configure_logging()
runner = CrawlerRunner()
@defer.inlineCallbacks
def crawl():
yield runner.crawl(MySpider)
yield runner.crawl(MySpider3)
reactor.stop()
crawl()
reactor.run()