I have the requests nested across different methods. The problem is with how I'm using the Items. This is my simplified code so far:
...
def start_requests(self):
    url = "http://example.com"
    yield scrapy.Request(url=url)

def parse(self, response):
    films_urls = response.css('a::attr(href)').extract()
    for href in films_urls:
        yield response.follow(href, callback=self.film)

def film(self, response):
    film = Film()
    # add url and tmdb_id
    film['list_urls'] = []
    open_stream = response.css('.show::attr(data-href)').extract_first()
    request = response.follow(open_stream, callback=self.streamers)
    request.meta['film'] = film
    yield request

def streamers(self, response):
    film = response.meta['film']
    urls = response.css('a.aporte::attr(href)').extract()
    for href in urls:  # list with all the urls to add to the dictionary
        request = response.follow(href, callback=self.infostream)
        request.meta['film'] = film
        yield request
    yield film  # yielded here, before the infostream/redirect callbacks have filled list_urls

def infostream(self, response):
    film = response.meta['film']
    stream = Stream()
    # add lang and web
    redirect = response.css('div.visit a::attr(href)').extract_first()
    request = response.follow(redirect, callback=self.redirect, meta={'dont_redirect': True})
    request.meta['stream'] = stream
    yield request
    film['list_urls'].append(stream)
    return film

def redirect(self, response):
    stream = response.meta['stream']
    # Any better way to get the redirected url?
    stream['url'] = response.headers['Location'].decode('UTF-8')
    return stream
Items:
class Film(scrapy.Item):
    url = scrapy.Field()
    tmdb_id = scrapy.Field()
    list_urls = scrapy.Field()

class Stream(scrapy.Item):
    lang = scrapy.Field()
    web = scrapy.Field()
    url = scrapy.Field()
I'm expecting a single Film object whose list_urls field ends up holding all the available stream URLs, but the spider never waits for them to be added and yields the film with list_urls empty.
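To be clear about what I'm after, the finished item should look roughly like this (the concrete values are made up, just to show the shape):

{'url': 'http://example.com/film/some-film',
 'tmdb_id': '12345',
 'list_urls': [{'lang': 'en', 'web': 'hosterA', 'url': 'http://redirect-target-1'},
               {'lang': 'es', 'web': 'hosterB', 'url': 'http://redirect-target-2'}]}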
EDIT: If I use `return film` in every method, it does return the object with the URLs, but it emits one item each time a URL is added instead of a single item once the list is complete.
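One direction I've been sketching (not sure it is the idiomatic way) is to chain the stream requests one at a time, so the film is only yielded from the last callback once nothing is pending. Roughly like this, where next_stream is a placeholder helper and the extra redirect hop is left out for brevity:

def streamers(self, response):
    film = response.meta['film']
    pending = response.css('a.aporte::attr(href)').extract()
    # hand over to a helper that processes one stream page at a time
    yield from self.next_stream(response, film, pending)

def next_stream(self, response, film, pending):
    if pending:
        request = response.follow(pending[0], callback=self.infostream)
        request.meta['film'] = film
        request.meta['pending'] = pending[1:]
        yield request
    else:
        # nothing left to visit: list_urls is complete, emit the finished film
        yield film

def infostream(self, response):
    film = response.meta['film']
    stream = Stream()
    # fill in lang, web and the resolved url here
    film['list_urls'].append(stream)
    # continue with whatever is still pending, or yield the film if nothing is
    yield from self.next_stream(response, film, response.meta['pending'])

Is something like this the right approach, or is there a cleaner way to wait for all the sub-requests before yielding the item?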