Spider在scrapy shell中能抓取数据,但在脚本中不行

时间:2015-07-11 18:36:26

标签: python html json web-scraping scrapy

我写了一个蜘蛛,使用来自另一个正常工作的蜘蛛的代码从this website抓取票证信息(只需要更改xpath和url)。当我在终端中使用url时,我能够访问响应并从中提取页面上的不同元素。当我运行蜘蛛脚本时,我收到此错误:

URLError: 2015-07-11 14:12:31 [boto]错误:无法读取实例数据,放弃

并且蜘蛛在打开后基本上没有抓取到任何元素。我已经测试了所有的xpath并且它们是正确的,我甚至直接从网页上复制了url。这个网站有什么不同之处,不允许它被抓取,还是代码有问题?顺便说一句,我能够从中获取数据的网站有一个以.html结尾的网址,所以可能只有那些类型的网站可以被抓取或者其他东西。感谢任何帮助,谢谢。

import sys
import re
import json
from scrapy.crawler import CrawlerProcess
from scrapy import Request
from scrapy.contrib.spiders import CrawlSpider , Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from seat_geeks_scraper.items import ComparatorItem
from urlparse import urljoin

# Prompt for a band name and build the SeatGeek listing URL to crawl.
# BUG FIX: the original used "seatgeeks.com" (extra "s"), which does not
# match allowed_domains ("seatgeek.com") below — Scrapy's offsite
# middleware would therefore drop every follow-up request, so the spider
# scraped nothing.
bandname = raw_input("Enter a bandname \n")
sg_url = "http://www.seatgeek.com/" + bandname + "-tickets"
class MySpider2(CrawlSpider):
    """Spider that scrapes event/ticket information from seatgeek.com.

    Pipeline: parse2 extracts the event fields from the listing page and
    requests the per-event ticket page; parse_price2 builds the listings
    JSON URL from that page's loader; parse_json2 reads the ticket price
    out of the JSON response and yields the finished item.

    NOTE(review): Scrapy only calls ``parse`` (or Rule callbacks) by
    default; with the entry callback named ``parse2`` and no ``rules``
    defined, this chain is never started unless wired up externally —
    confirm how the spider is launched. Names are kept unchanged here to
    preserve the external interface.
    """
    # Accept HTTP 416 (Range Not Satisfiable) responses instead of
    # treating them as errors.
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["seatgeek.com"]
    start_urls = [sg_url]
    # XPath selecting each schema.org Event node on the listing page.
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'

    def parse_json2(self, response):
        """Final step: extract the ticket price from the listings JSON.

        Expects ``response.meta['loader']`` to carry the item loader
        populated by parse2. Returns the loaded item.
        """
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        listings_info = jsonresponse.get('listings')
        # BUG FIX: the original iterated over an undefined name
        # ``ticket_info`` (NameError); the parsed listings live in
        # ``listings_info``. Also tolerate a missing/empty 'listings'
        # key rather than raising on events with no listings.
        price_list = [i.get('pf') for i in (listings_info or [])]
        if price_list:
            loader.add_value('ticketPrice', price_list[0])
        return loader.load_item()

    def parse_price2(self, response):
        """Intermediate step: build the listings-JSON URL and request it."""
        loader = response.meta['loader']
        ticketsLink = loader.get_output_value("ticketsLink")
        # The event id is the 7th path segment of the tickets link.
        json_id = ticketsLink.split('/')[6]
        json_url = "https://seatgeek.com/listings?client_id=MTY2MnwxMzgzMzIwMTU4&id=" + json_id + "&_wt=1&&_=1436364489501"
        # BUG FIXES: the module imports ``Request`` directly (``scrapy``
        # is never imported as a module, so ``scrapy.Request`` raised
        # NameError), and the JSON callback is named ``parse_json2`` —
        # ``self.parse_json`` does not exist.
        yield Request(json_url, meta={'loader': loader}, callback=self.parse_json2, dont_filter=True)

    def parse2(self, response):
        """Entry step: yield one request per event on the listing page.

        Each request carries an item loader pre-populated with the event
        fields and is handed on to parse_price2.
        """
        selector = HtmlXPathSelector(response)
        # iterate over the Event nodes on the page
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # loader defaults: strip whitespace on input, join fragments
            # on output
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # add the xpath for every field of the item
            loader.add_xpath('eventName', './/a[@class = "event-listing-title"]/span[@itemprop = "name"]/text()')
            loader.add_xpath('eventLocation', './/a[@class = "event-listing-venue-link"]/span[@itemprop = "name"]/text()')
            loader.add_xpath('ticketsLink', '//a[@class = "event-listing-button"]/@href')
            loader.add_xpath('eventDate', '//div[@class = "event-listing-date"]/text()')
            loader.add_xpath('eventCity', './/span[@itemprop  = "addressLocality"]/text()')
            loader.add_xpath('eventState', './/span[@itemprop  = "addressRegion"]/text()')
            loader.add_xpath('eventCountry', './/span[@itemprop  = "addressCountry"]/text()')
            loader.add_xpath('eventTime', '//div[@class = "event-listing-time"]/text()')

            # BUG FIX: the original built a scheme-less URL
            # ("www.seatgeek.com/..."), which Scrapy rejects as a
            # non-absolute URL; use urljoin (already imported at module
            # level) so relative hrefs resolve correctly too.
            tickets_url = urljoin("http://www.seatgeek.com/", loader.get_output_value("ticketsLink"))
            # BUG FIX: ``scrapy.Request`` -> ``Request`` (see parse_price2).
            yield Request(tickets_url, meta={'loader': loader}, callback=self.parse_price2, dont_filter=True)

enter image description here

enter image description here

0 个答案:

没有答案