在Scrapy中使用嵌套解析器时保存的重复项

时间:2019-06-12 08:47:52

标签: python json scrapy

我在使用Scrapy时遇到了一个问题,与它输出item的方式有关。

这是我的items.py:

import scrapy
class Club(scrapy.Item):
    """Item describing one club; each attribute declares a ``scrapy.Field``."""
    # Club name text (the spider reads it from the '.nom_sportif' element).
    name = scrapy.Field()
    # URL of the club page the data came from (the spider stores response.url).
    url = scrapy.Field()
    # 'src' attribute of the club logo image.
    logo = scrapy.Field()
    # List that the spider fills with one dict per player.
    players = scrapy.Field()

这是我唯一的爬虫(spider):

import scrapy
from lequipefr.items import Club


class NamesSpider(scrapy.Spider):
    """Crawl a club page, then follow every player link found on it.

    One ``Club`` item is created per club page and handed to each player
    request through ``meta['item']``; ``parse_player`` appends to it.
    """

    name = "names"
    allowed_domains = ['lequipe.fr']

    def start_requests(self):
        """Yield the initial request(s); responses go to ``parse_club``."""
        club_pages = ['https://www.lequipe.fr/Football/FootballFicheClub26.html']
        for club_page in club_pages:
            yield scrapy.Request(url=club_page, callback=self.parse_club)

    def parse_club(self, response):
        """Build the ``Club`` item and schedule one request per player link.

        Nothing is yielded as an item here; only follow-up requests are
        produced, each carrying the same ``Club`` instance in its meta.
        """
        club = Club(
            url=response.url,
            name=response.css('.nom_sportif::text').get(),
            logo=response.css('.visuels-club').xpath('./figure/img/@src').get(),
            players=[],
        )
        player_links = response.css('.effectifclub').css('.nom').xpath('./a/@href')
        for href in player_links.getall():
            # Every request shares the one club object created above.
            yield response.follow(href, callback=self.parse_player, meta={'item': club})

    def parse_player(self, response):
        """Append this player's data to the shared club item and yield it."""
        club = response.meta['item']
        club['players'].append({
            'url': response.url,
            'name': response.css('.nom_sportif::text').get(),
            'number': response.css('.identite').xpath("//*[contains(text(), 'Numéro')]").xpath('./strong/text()').get(),
            'photo': response.css('.visuel').xpath('./figure/img/@src').get(),
        })
        # This runs once per player response, so the same club item is
        # yielded each time with the players collected so far.
        yield club

这是我的JSON输出:

[
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur57102.html", "name": "Christopher Nkunku", "number": "24", "photo": "//medias.lequipe.fr/img-sportif-foot/57102/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur57102.html", "name": "Christopher Nkunku", "number": "24", "photo": "//medias.lequipe.fr/img-sportif-foot/57102/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur27703.html", "name": "Angel Di Maria", "number": "11", "photo": "//medias.lequipe.fr/img-sportif-foot/27703/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur57102.html", "name": "Christopher Nkunku", "number": "24", "photo": "//medias.lequipe.fr/img-sportif-foot/57102/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur27703.html", "name": "Angel Di Maria", "number": "11", "photo": "//medias.lequipe.fr/img-sportif-foot/27703/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur26337.html", "name": "Edinson Cavani", "number": "9", "photo": "//medias.lequipe.fr/img-sportif-foot/26337/110"}]},
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur57102.html", "name": "Christopher Nkunku", "number": "24", "photo": "//medias.lequipe.fr/img-sportif-foot/57102/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur27703.html", "name": "Angel Di Maria", "number": "11", "photo": "//medias.lequipe.fr/img-sportif-foot/27703/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur26337.html", "name": "Edinson Cavani", "number": "9", "photo": "//medias.lequipe.fr/img-sportif-foot/26337/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur58221.html", "name": "Kylian Mbapp\u00e9", "number": "7", "photo": "//medias.lequipe.fr/img-sportif-foot/58221/110"}]}
]

相反,这就是我希望的输出:

[
{"url": "https://www.lequipe.fr/Football/FootballFicheClub26.html", "name": "PARIS-SG (PSG)", "logo": "//medias.lequipe.fr/logo-football/26/200", "players": [{"url": "https://www.lequipe.fr/Football/FootballFicheJoueur35846.html", "name": "Alphonse Areola", "number": "16", "photo": "//medias.lequipe.fr/img-sportif-foot/35846/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45461.html", "name": "Adrien Rabiot", "number": "25", "photo": "//medias.lequipe.fr/img-sportif-foot/45461/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur47797.html", "name": "Leandro Paredes", "number": "8", "photo": "//medias.lequipe.fr/img-sportif-foot/47797/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur45183.html", "name": "Marco Verratti", "number": "6", "photo": "//medias.lequipe.fr/img-sportif-foot/45183/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur57102.html", "name": "Christopher Nkunku", "number": "24", "photo": "//medias.lequipe.fr/img-sportif-foot/57102/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur27703.html", "name": "Angel Di Maria", "number": "11", "photo": "//medias.lequipe.fr/img-sportif-foot/27703/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur26337.html", "name": "Edinson Cavani", "number": "9", "photo": "//medias.lequipe.fr/img-sportif-foot/26337/110"}, {"url": "https://www.lequipe.fr/Football/FootballFicheJoueur58221.html", "name": "Kylian Mbapp\u00e9", "number": "7", "photo": "//medias.lequipe.fr/img-sportif-foot/58221/110"}]}
]

如您所见,“player”字典并没有被追加到同一个item上、最后只yield一次;相反,每次迭代都会在.json输出文件中多写入一个重复的item。

我该如何让item保持这种嵌套结构,同时输出中不出现重复项?

1 个答案:

答案 0 :(得分:1)

您在每个球员页面的回调中都yield了一次包含球员列表的item,所以输出自然与您期望的不一致。

我建议您使用inline_requests库,文档在此处:https://pypi.org/project/scrapy-inline-requests/ 。它让您可以在父回调函数中向球员页面发出请求,并在同一个函数中直接拿到响应。

请参考下面这个可运行的解决方案:

# -*- coding: utf-8 -*-
import scrapy
from inline_requests import inline_requests


class NamesSpider(scrapy.Spider):
    """Crawl a club page and gather all of its players into a single item.

    ``parse_club`` is wrapped with ``@inline_requests``: each player page is
    requested with ``yield scrapy.Request(...)`` and the value of that yield
    expression is used as the player page's response, so the whole club is
    assembled inside one callback and yielded once at the end.
    """

    name = "names"
    allowed_domains = ['lequipe.fr']

    def start_requests(self):
        """Yield the initial request(s); responses go to ``parse_club``."""
        club_pages = ['https://www.lequipe.fr/Football/FootballFicheClub26.html']
        for club_page in club_pages:
            yield scrapy.Request(url=club_page, callback=self.parse_club)

    @inline_requests
    def parse_club(self, response):
        """Collect club data, fetch each player page inline, yield one dict."""
        club = {
            'url': response.url,
            'name': response.css('.nom_sportif::text').get(),
            'logo': response.css('.visuels-club').xpath('./figure/img/@src').get(),
            'players': [],
        }
        player_links = response.css('.effectifclub').css('.nom').xpath('./a/@href')
        for href in player_links.getall():
            player_url = response.urljoin(href)
            # The value sent back into the generator is the player page response.
            player_page = yield scrapy.Request(player_url)
            club['players'].append({
                'url': player_url,
                'name': player_page.css('.nom_sportif::text').get(),
                'number': player_page.css('.identite').xpath(
                    u"//*[contains(text(), 'Numéro')]").xpath('./strong/text()').get(),
                'photo': player_page.css('.visuel').xpath('./figure/img/@src').get(),
            })

        # Emitted once, after every player page has been processed.
        yield club