我编写了一个运行scrapy蜘蛛的脚本,该蜘蛛位于不同的目录中。该脚本接受用户输入,解析它并将其添加到要删除的URL。该脚本似乎提前工作,但现在我收到以下错误:
URLError: <urlopen error [Errno 101] Network is unreachable>
ERROR: Unable to read instance data, giving up
使用scrapy crawl
命令运行时,蜘蛛的代码可正常工作,但由于某种原因从脚本运行时无效。
以下是从脚本(位于蜘蛛文件中)运行蜘蛛的函数的代码:
def spiderCrawl(bandname):
    """Point the spider at *bandname*'s listing page, then crawl it.

    Runs a single crawl via CrawlerRunner and blocks on the Twisted
    reactor until the crawl finishes.
    """
    spider = MySpider3()
    spider.create_link(bandname)  # install the band-specific start URL
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    crawl_runner = CrawlerRunner()
    deferred = crawl_runner.crawl(spider)
    # Stop the reactor whether the crawl succeeded or failed, so the
    # script can exit; reactor.run() blocks until that happens.
    deferred.addBoth(lambda _: reactor.stop())
    reactor.run()
创建网址的函数:
def create_link(self, bandname):
    """Build the TicketCity concert URL for *bandname* and store it on the spider.

    Bug fixed: the original assigned to a *local* variable ``start_urls``,
    which was discarded when the function returned, so the spider kept its
    old start URLs.  The attribute must be set on the instance via ``self``.
    """
    tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
    self.start_urls = [tc_url]
此外,下面是带有错误消息的终端截图。即使输入一个随机的乐队名也会出现同样的情况,这表明网址根本就没有被读取。这可能是什么问题?任何帮助将不胜感激,谢谢。
更新
所以问题似乎是我的 spider 类中的 create_link 方法没有把链接正确地添加到 start_urls 列表中;但当我把 raw_input 语句放在蜘蛛文件内部(而不是脚本中)时,脚本就能正常运行蜘蛛。把用户输入的参数传递给蜘蛛文件并将其添加为链接的正确方法是什么?为了让帖子更完整,下面附上蜘蛛的代码和运行蜘蛛的脚本:
脚本代码
# Driver script: prompt the user for a band name and hand it to the spider
# module, which builds the URL and runs the crawl.
from ticket_city_scraper.ticket_city_scraper import *
from ticket_city_scraper.ticket_city_scraper.spiders import tc_spider
bandname = raw_input("Enter bandname\n") # I took out this line and added it to the spider file to make the script work
tc_spider.spiderCrawl(bandname)
蜘蛛文件
class MySpider3(CrawlSpider):
    """Scrapes TicketCity concert listings and per-event ticket prices."""

    # Treat HTTP 416 (Requested Range Not Satisfiable) as a normal response
    # instead of letting Scrapy discard it.
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.ticketcity.com"]
    # NOTE(review): tc_url is not defined at class scope in this snippet, so
    # this line raises NameError at import time unless tc_url is a
    # module-level name defined elsewhere in the spider file — confirm.
    start_urls = [tc_url]
    tickets_list_xpath = './/div[@class = "vevent"]'

    def create_link(self, bandname):
        # Build the band-specific listing URL and install it on this
        # instance.  Assigning via self only affects this instance, not the
        # class attribute Scrapy reads when it constructs its own spider.
        tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
        self.start_urls = [tc_url]
        #return tc_url

    # NOTE(review): duplicate of the class attribute defined above.
    tickets_list_xpath = './/div[@class = "vevent"]'

    def parse_json(self, response):
        """Extract the first listed ticket price from the JSON price feed."""
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        # 'B' presumably holds the list of ticket blocks and 'P' each
        # block's price — TODO confirm against the TicketCity API.
        ticket_info = jsonresponse.get('B')
        price_list = [i.get('P') for i in ticket_info]
        if len(price_list) > 0:
            str_Price = str(price_list[0])
            ticketPrice = unicode(str_Price, "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        else:
            # No price blocks returned: record the event as sold out.
            ticketPrice = unicode("sold out", "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()

    def parse_price(self, response):
        """Scrape city/state/date from the event page, then request its JSON price feed."""
        print "parse price function entered \n"
        loader = response.meta['loader']
        event_City = response.xpath('.//span[@itemprop="addressLocality"]/text()').extract()
        eventCity = ''.join(event_City)
        loader.add_value('eventCity' , eventCity)
        event_State = response.xpath('.//span[@itemprop="addressRegion"]/text()').extract()
        eventState = ''.join(event_State)
        loader.add_value('eventState' , eventState)
        event_Date = response.xpath('.//span[@class="event_datetime"]/text()').extract()
        eventDate = ''.join(event_Date)
        loader.add_value('eventDate' , eventDate)
        ticketsLink = loader.get_output_value("ticketsLink")
        # The numeric event id is the trailing digit run of the tickets link.
        json_id_list= re.findall(r"(\d+)[^-]*$", ticketsLink)
        json_id= "".join(json_id_list)
        json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
        yield scrapy.Request(json_url, meta={'loader': loader}, callback = self.parse_json, dont_filter = True)

    def parse(self, response):
        """Parse the listings page: yield one price-page request per .vevent block."""
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader
            loader.add_xpath('eventName' , './/span[@class="summary listingEventName"]/text()')
            loader.add_xpath('eventLocation' , './/div[@class="divVenue location"]/text()')
            loader.add_xpath('ticketsLink' , './/a[@class="divEventDetails url"]/@href')
            #loader.add_xpath('eventDateTime' , '//div[@id="divEventDate"]/@title') #datetime type
            #loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')
            print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
            #sel.xpath("//span[@id='PractitionerDetails1_Label4']/text()").extract()
            ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
            # urljoin against the response URL normalizes relative hrefs.
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)
def spiderCrawl(bandname):
    """Run MySpider3 against *bandname*'s TicketCity listing page.

    Bug fixed: the ``create_link`` call was commented out, so ``bandname``
    was never used and the spider fell back to its class-level
    ``start_urls`` — the user-entered band never reached the crawl.

    Note: the Twisted reactor cannot be restarted, so this function can
    only be called once per process.
    """
    aSpider = MySpider3()
    aSpider.create_link(bandname)  # install the band-specific start URL
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(aSpider)
    # Stop the reactor on success or failure so the script can exit;
    # reactor.run() blocks until reactor.stop() fires.
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
答案 0(得分:1)
由于你没有提供完整的错误回溯信息,我只能猜测。不过,我认为你的 create_link 函数中的这一行:
start_urls = [tc_url]
应该是:
self.start_urls = [tc_url]