from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from viagogo.items import ViagogoItem
from scrapy.http import Request
class viagogoSpider(CrawlSpider):
    """Crawl viagogo concert-ticket subject pages and request their grid data.

    The crawl starts at the Concert-Tickets page, follows every link matched
    by ``rules`` and, for each page reached that way, issues a second request
    to the same URL carrying a ``GetGridData`` payload.
    """

    name = "viagogo"
    allowed_domains = ['viagogo.com']
    start_urls = ["http://www.viagogo.com/Concert-Tickets"]
    rules = (
        # Visit each subject listed under a title, such as Rock in music.
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('//a[@class="t xs"]',)),
            callback='Parse_Subject_Tickets',
            follow=True,
        ),
    )

    def Parse_Subject_Tickets(self, response):
        """Handle one subject page and ask the same URL for its grid data.

        ``Request.meta`` is never transmitted to the server: it only travels
        with the request inside Scrapy and comes back on ``response.meta``.
        Putting the payload there therefore re-downloads the very same page.
        The payload is sent as the body of a POST request instead.

        NOTE(review): assumes the endpoint accepts a JSON-encoded POST body;
        confirm the exact method, body format and headers against the XHR
        call the site itself makes.

        :param response: the downloaded subject page.
        :returns: generator yielding the follow-up ``Request``.
        """
        # Local import keeps this block self-contained.
        import json

        item = ViagogoItem()
        item["title"] = response.xpath('//title/text()').extract()
        item["link"] = response.url

        payload = {"method": "GetGridData"}
        yield Request(
            response.url,
            callback=self.Parse_artists_Tickets,
            method="POST",
            body=json.dumps(payload),
            headers={"Content-Type": "application/json"},
            # "method" is kept for callbacks that read it from response.meta;
            # "item" forwards the partially filled item instead of dropping it.
            meta={"method": "GetGridData", "item": item},
            # Same URL as the page just crawled; keep it from being
            # discarded by the duplicate filter.
            dont_filter=True,
        )

    def Parse_artists_Tickets(self, response):
        """Print the raw body of the grid-data response (debug output).

        :param response: the response to the follow-up request; the item
            started in ``Parse_Subject_Tickets`` is in ``response.meta``.
        """
        # Parenthesised form prints the same on Python 2 and parses on 3.
        print(response.body)
我在规则中抓取所有 Concert-Tickets/XXXX 页面,并在 Parse_Subject_Tickets 中尝试构造 JSON 请求;但在 Parse_artists_Tickets 中打印出来的内容完全是原始页面,而不是新的艺术家数据……
有什么想法吗?
谢谢!