我用 Scrapy 写了一个在同一进程中运行两个爬虫(spider)的网页抓取程序,但在终端中用 `scrapy crawl` 命令运行代码时,出现了错误:`ImportError: cannot import name CrawlerRunner`(无法导入名称 CrawlerRunner)。此前我已经按照另一篇 Stack Overflow 帖子中的方法尝试同时运行多个 Scrapy 爬虫,但问题仍未解决。我原以为使用 Twisted 库也许可行,但似乎缺少某些依赖,只是我无法确定具体缺少什么。要导入该库,是否需要额外安装什么?我已经安装了 python-twisted。爬虫的代码如下:
import scrapy
import re
import json
from scrapy.crawler import CrawlerProcess
from scrapy import Request
from scrapy.contrib.spiders import CrawlSpider , Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from concert_comparator.items import ComparatorItem, ComparatorItem2
# Band name typed by the user, normalised to a lowercase, hyphen-separated
# slug before it is spliced into the URLs below: surrounding whitespace is
# dropped and inner runs of whitespace become single hyphens
# ("Taylor  Swift " -> "taylor-swift"). Input that is already a slug is
# left unchanged.
# NOTE(review): assumes both sites use lowercase hyphenated slugs in these
# paths -- confirm against the live URLs.
# NOTE: the prompt runs at import time, so importing this module blocks on
# stdin until a name is entered.
bandname = "-".join(raw_input("Enter a bandname \n").lower().split())

# Listing pages for the band on each site; used as the spiders' start URLs.
vs_url = "http://www.vividseats.com/concerts/" + bandname + "-tickets.html"
sg_url = "http://www.seatgeek.com/" + bandname + "-tickets"
class MySpider(CrawlSpider):
    """Collect Vivid Seats events for ``bandname`` together with a ticket price.

    Each item is built across three callbacks, with the partially filled
    item loader carried along in ``request.meta['loader']``:

    1. ``parse`` reads every event on the listing page,
    2. ``parse_price`` derives the production id from the ticket link,
    3. ``parse_json`` reads the price from the tickets JSON feed and emits
       the finished item.

    NOTE(review): no ``rules`` are defined and ``parse`` is overridden, so
    none of CrawlSpider's link-following is used here; ``scrapy.Spider``
    looks like the intended base class -- confirm before changing it.
    """

    # Let HTTP 416 responses reach the callbacks instead of being dropped.
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    start_urls = [vs_url]
    # One match per event block on the listing page.
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'

    def parse_json(self, response):
        """Add the first listed ticket price to the item and return the item.

        Expects ``response.meta['loader']`` to hold the item loader started
        in ``parse``. When the feed lists no tickets (or the first entry has
        no ``p`` value) the item is returned without a ``ticketPrice``.
        """
        loader = response.meta['loader']
        payload = json.loads(response.body_as_unicode())
        # ``tickets`` may be missing or null; treat both as "no tickets".
        tickets = payload.get('tickets') or []
        prices = [entry.get('p') for entry in tickets]
        if prices and prices[0] is not None:
            loader.add_value('ticketPrice', ''.join(prices[0]))
        return loader.load_item()

    def parse_price(self, response):
        """Request the tickets JSON feed for the event held in the loader.

        The production id is the last run of digits in the ticket link that
        is not followed by a hyphen.
        """
        loader = response.meta['loader']
        tickets_link = loader.get_output_value("ticketsLink")
        json_id = "".join(re.findall(r"(\d+)[^-]*$", tickets_link))
        json_url = "http://www.vividseats.com/javascript/tickets.shtml?productionId=" + json_id
        yield scrapy.Request(
            json_url,
            meta={'loader': loader},
            callback=self.parse_json,
            dont_filter=True,
        )

    def parse(self, response):
        """Yield one event-page request per event found on the listing page."""
        for ticket in Selector(response).xpath(self.tickets_list_xpath):
            loader = ItemLoader(ComparatorItem(), selector=ticket)
            # Strip every extracted text node, then join the pieces of a
            # field into a single space-separated string.
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # All XPaths are relative to the current event block.
            loader.add_xpath('eventName', './/*[@class="productionsEvent"]/text()')
            loader.add_xpath('eventLocation', './/*[@class = "productionsVenue"]/span[@itemprop = "name"]/text()')
            loader.add_xpath('ticketsLink', './/*/a[@class = "btn btn-primary"]/@href')
            loader.add_xpath('eventDate', './/*[@class = "productionsDate"]/text()')
            loader.add_xpath('eventCity', './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressLocality"]/text()')
            loader.add_xpath('eventState', './/*[@class = "productionsVenue"]/span[@itemprop = "address"]/span[@itemprop = "addressRegion"]/text()')
            loader.add_xpath('eventTime', './/*[@class = "productionsTime"]/text()')

            tickets_link = loader.get_output_value("ticketsLink")
            print("Here is ticket link \n" + tickets_link)

            relative_url = "concerts/" + bandname + "-tickets/" + bandname + "-" + tickets_link
            # Resolve against the page URL so the request gets a scheme and host.
            tickets_url = response.urljoin(relative_url)
            yield scrapy.Request(
                tickets_url,
                meta={'loader': loader},
                callback=self.parse_price,
                dont_filter=True,
            )
class MySpider2(CrawlSpider):
    """Collect SeatGeek events for ``bandname`` together with a ticket price.

    Each item is built across three callbacks, with the partially filled
    item loader carried along in ``request.meta['loader']``:

    1. ``parse2`` reads every event on the listing page,
    2. ``parse_price2`` derives the listing id from the ticket link,
    3. ``parse_json2`` reads the price from the listings JSON feed and
       emits the finished item.

    NOTE(review): the items are built as ``ComparatorItem`` although
    ``ComparatorItem2`` is imported and otherwise unused, and
    ``eventCountry`` is only populated here -- confirm which item class
    declares that field.
    """

    # Let HTTP 416 responses reach the callbacks instead of being dropped.
    handle_httpstatus_list = [416]
    name = 'comparator2'
    # Bare domain, no trailing slash: covers both seatgeek.com (JSON feed)
    # and www.seatgeek.com (listing pages).
    allowed_domains = ["seatgeek.com"]
    start_urls = [sg_url]
    # One match per event block on the listing page.
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'

    def start_requests(self):
        """Send the start URLs to ``parse2``.

        Scrapy's default callback is ``parse``; without this override
        ``parse2`` would never be called.
        """
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse2, dont_filter=True)

    def parse_json2(self, response):
        """Add the first listing's price to the item and return the item.

        Expects ``response.meta['loader']`` to hold the item loader started
        in ``parse2``. When the feed has no listings (or the first entry has
        no ``pf`` value) the item is returned without a ``ticketPrice``.
        """
        loader = response.meta['loader']
        payload = json.loads(response.body_as_unicode())
        # ``listings`` may be missing or null; treat both as "no listings".
        listings = payload.get('listings') or []
        prices = [entry.get('pf') for entry in listings]
        if prices and prices[0] is not None:
            # The loader's default input processor is ``unicode.strip``, so
            # hand it text even if the feed delivers ``pf`` as a number.
            loader.add_value('ticketPrice', u"%s" % (prices[0],))
        return loader.load_item()

    def parse_price2(self, response):
        """Request the listings JSON feed for the event held in the loader."""
        loader = response.meta['loader']
        tickets_link = loader.get_output_value("ticketsLink")
        # NOTE(review): assumes the listing id is the 7th '/'-separated piece
        # of the link, as in the original code -- confirm against live hrefs.
        json_id = tickets_link.split('/')[6]
        json_url = "https://seatgeek.com/listings?client_id=MTY2MnwxMzgzMzIwMTU4&id=" + json_id + "&_wt=1&&_=1436364489501"
        yield scrapy.Request(
            json_url,
            meta={'loader': loader},
            callback=self.parse_json2,
            dont_filter=True,
        )

    def parse2(self, response):
        """Yield one event-page request per event found on the listing page."""
        for ticket in Selector(response).xpath(self.tickets_list_xpath):
            loader = ItemLoader(ComparatorItem(), selector=ticket)
            # Strip every extracted text node, then join the pieces of a
            # field into a single space-separated string.
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()

            # Every XPath starts with './/' so it is evaluated inside the
            # current event block; a leading '//' would select from the
            # whole page and give every item the same joined values.
            loader.add_xpath('eventName', './/a[@class = "event-listing-title"]/span[@itemprop = "name"]/text()')
            loader.add_xpath('eventLocation', './/a[@class = "event-listing-venue-link"]/span[@itemprop = "name"]/text()')
            loader.add_xpath('ticketsLink', './/a[@class = "event-listing-button"]/@href')
            loader.add_xpath('eventDate', './/div[@class = "event-listing-date"]/text()')
            loader.add_xpath('eventCity', './/span[@itemprop = "addressLocality"]/text()')
            loader.add_xpath('eventState', './/span[@itemprop = "addressRegion"]/text()')
            loader.add_xpath('eventCountry', './/span[@itemprop = "addressCountry"]/text()')
            loader.add_xpath('eventTime', './/div[@class = "event-listing-time"]/text()')

            # Resolve against the page URL: works for relative and absolute
            # hrefs alike and always yields a URL with a scheme.
            tickets_url = response.urljoin(loader.get_output_value("ticketsLink"))
            yield scrapy.Request(
                tickets_url,
                meta={'loader': loader},
                callback=self.parse_price2,
                dont_filter=True,
            )
# Run both spiders in a single process when this file is executed directly.
# The guard keeps a plain import of this module (for example Scrapy's own
# spider discovery during ``scrapy crawl``) from starting a crawl as an
# import side effect.
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(MySpider)
    process.crawl(MySpider2)
    # Blocks here until both crawls have finished.
    process.start()