我是Python Scrapy的新手,我正在尝试从3层嵌套页面创建JSON文件。我有以下结构:
Page 1 (start): contains links of second page (called Mangas)
Page 2: Contains nested Volumes and Chapters
Page 3: Each Chapter contains multiple images
我的代码
import scrapy
import time
import items
import json
class GmangaSpider(scrapy.Spider):
name = "gmanga"
start_urls = [
"http://gmanga.me/mangas"
]
def parse(self, response):
# mangas = []
for manga in response.css('div.manga-item'):
link = manga.css('a.manga-item-content').xpath('@href').extract_first()
if link:
page_link = "http://gmanga.me%s" % link
mangas = items.Manga()
mangas['cover'] = manga.css('a.manga-item-content .manga-cover-container img').xpath('@src').extract_first()
mangas['title'] = manga.css('a.manga-item-content .manga-cover-container img').xpath('@alt').extract_first()
mangas['link'] = page_link
mangas['volumes'] = []
yield scrapy.Request(page_link, callback=self.parse_volumes, meta = {"mangas": mangas})
def parse_volumes(self, response):
mangas = response.meta['mangas']
for manga in response.css('div.panel'):
volume = items.Volume()
volume['name'] = manga.css('div.panel-heading .panel-title a::text').extract_first()
volume['chapters'] = []
for tr in manga.css('div.panel-collapse .panel-body table tbody tr'):
chapter = items.Chapter()
chapter['name'] = tr.css('td:nth_child(1) div::text').extract_first()
chapter_link = tr.css('td:nth_child(3) a::attr("href")').extract_first()
chapter['link'] = chapter_link
request = scrapy.Request("http://gmanga.me%s" % chapter_link, callback = self.parse_images, meta = {"chapter": chapter})
yield request
volume['chapters'].append(chapter)
mangas['volumes'].append(volume)
yield mangas
def parse_images(self, response):
chapter = response.meta['chapter']
data = response.xpath("//script").re("alphanumSort\((.*])")
if data:
images = json.loads(data[0])
chapter['images'] = images
return chapter
我的Items.py
from scrapy import Item, Field
class Manga(Item):
title = Field()
cover = Field()
link = Field()
volumes = Field()
class Volume(Item):
name = Field()
chapters = Field()
class Chapter(Item):
name = Field()
images = Field()
link = Field()
现在我在parse_volumes
函数中有点困惑,在json文件中产生或返回以获取以下结构。
预期结果:
[{
"cover": "http://media.gmanga.me/uploads/manga/cover/151/medium_143061.jpg",
"link": "http://gmanga.me/mangas/gokko",
"volumes": [{
"name": "xyz",
"chapters": [{
"link": "/mangas/gokko/4/3asq",
"name": "4",
"images": ["img1.jpg", "img2.jpg"]
}, {
"link": "/mangas/gokko/3/3asq",
"name": "3",
"images": ["img1.jpg", "img2.jpg"]
}]
}],
"title": "Gokko"
}]
但我将图像节点作为单独的节点,它必须在卷的章节节点内:
[{
"cover": "http://media.gmanga.me/uploads/manga/cover/10581/medium_I2.5HFzVh7e.png",
"link": "http://gmanga.me/mangas/godess-creation-system",
"volumes": [{
"name": "\u0627\u0644\u0645\u062c\u0644\u062f ",
"chapters": [{
"link": "/mangas/godess-creation-system/1/ayou-cahn",
"name": "1"
}]
}],
"title": "Godess Creation System"
},
{
"images": ["http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/01.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/02.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/03.jpg?ak=p0skml", "http://media.gmanga.me/uploads/releases/lolly-pop/047-20160111235059UXYGJACW/04.jpg?ak=p0skml"],
"link": "/mangas/reversal/1/Lolly-Pop",
"name": "1"
}]
每个函数都是正确地单独提取数据,唯一的问题是JSON形成。它没有正确写入json文件。请带我到我错的地方。