for循环不迭代

时间:2016-07-02 05:56:16

标签: python scrapy

我完全被这个错误难住了。我想从这个页面 https://www.alloresto.fr/restaurant-livraison-a-domicile/restaurant/pizza-mia/angers-centre-ville/particuliers/carte 中提取所有菜单项。但是当执行到最内层的 for 循环时,它只迭代一次就继续往下执行了。这完全出乎意料,我不知道是什么原因造成的。下面是专门用于解析该页面的函数。

def get_menu(self, response):
    """Yield one ``Restaurant`` item per menu entry on a restaurant page.

    Restaurant-level fields (name, address parts, food type) are read from
    the request meta; menu-level fields are scraped from each ``ul/li``
    under the direct ``div`` children of ``div#contenu_choixplats``.

    Args:
        response: The Scrapy response for the restaurant's menu page.

    Yields:
        Restaurant: One item per menu entry.

    Raises:
        KeyError: If an expected key is missing from the request meta
            (only reached when the page has at least one menu entry).
    """
    # response.meta is Scrapy's shortcut for response.request.meta.
    meta = response.meta

    # extract_first() returns None when no <img> matches, and urljoin(None)
    # falls back to the page URL itself -- which would queue the HTML page
    # as an "image". Emit an empty list in that case instead.
    image_src = response.xpath('//span/img/@src').extract_first()
    image_urls = [response.urljoin(image_src)] if image_src else []

    for menu_section in response.xpath("//div[@id = 'contenu_choixplats']/div"):
        menu_section_name = menu_section.xpath('dl/dt/text()').extract_first()
        for menu_item in menu_section.xpath('ul/li'):
            price = (
                menu_item.xpath('div')
                .css('div.product-price-with-offer')
                .xpath('p/text()')
                .extract_first()
            )
            yield Restaurant(
                restaurant_url=response.url,
                restaurant_name=meta['restaurant_name'],
                street_name=meta['street_name'],
                street_number=meta['street_number'],
                city=meta['city'],
                zip_code=meta['zip_code'],
                food_type=meta['food_type'],
                # Fresh list per item so items never share a mutable value.
                image_urls=list(image_urls),
                menu_category=menu_section_name,
                # Kept as a list (extract), unlike the other text fields.
                menu_item_title=menu_item.xpath('div/h3/text()').extract(),
                menu_item_details=menu_item.xpath('div/p/text()').extract_first(),
                menu_item_price=price,
            )

你能看出我遗漏了什么吗?感谢你抽出时间。

--- UPDATE ---

这是完整的代码。我把它贴出来,以防问题出在 get_menu 函数之外。请注意,要从网站索引页向下深入两层页面之后,才会调用 get_menu 函数。

spiders/alloresto_spider.py

import scrapy
import re
from french_scraping.items import Restaurant

class DmozSpider(scrapy.Spider):
    """Crawl alloresto.fr and emit one ``Restaurant`` item per menu entry.

    The crawl is three levels deep: ``parse`` follows the links on the
    start page, ``restaurants_for_this_city`` walks each (paginated)
    restaurant listing, and ``get_menu`` turns a restaurant page into items.
    """

    name = "alloresto"
    allowed_domains = ['alloresto.fr']
    start_urls = ["https://www.alloresto.fr/livraison/villes/"]

    # Splits an address string into street number, street name, city and a
    # five-digit zip code. Compiled once rather than on every loop pass.
    _ADDRESS_RE = re.compile(r'^([\d-]*?)\W(.*?),\W(.*?)\W(\d\d\d\d\d)')

    def parse(self, response):
        """Request the restaurant listing behind every ``ul/li/a`` link."""
        for href in response.xpath('//ul/li/a/@href').extract():
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.restaurants_for_this_city,
            )

    def restaurants_for_this_city(self, response):
        """Request each restaurant's menu page, then the next listing page.

        Restaurants without a link, without an address, or whose address
        does not match the expected pattern are skipped. The parsed address
        parts travel to ``get_menu`` through the request meta.
        """
        details = 'div/section[@class="restaurantDetails"]'
        for restaurant in response.xpath('//article/div'):
            full_address = restaurant.xpath(details + '/address/text()').extract_first()
            # extract_first() returns None when there is no address node;
            # feeding None to the regex would raise TypeError and abort the
            # whole callback, losing the remaining restaurants and the
            # pagination request below.
            if full_address is None:
                continue
            match = self._ADDRESS_RE.search(full_address)
            if match is None:
                continue
            street_number, street_name, city, zip_code = match.groups()

            href = restaurant.xpath('a/@href').extract_first()
            if href is None:
                # urljoin(None) would fall back to this listing page's URL.
                continue
            restaurant_url = response.urljoin(href)

            meta_data = {
                'restaurant_url': restaurant_url,
                'restaurant_name': restaurant.xpath(details + '/h3/a/text()').extract_first(),
                'street_number': street_number,
                'street_name': street_name,
                'city': city,
                'zip_code': zip_code,
                'food_type': restaurant.xpath('div/section/p')
                                       .css('.restaurantCuisines')
                                       .xpath('text()')
                                       .extract(),
            }
            yield scrapy.Request(restaurant_url, meta=meta_data, callback=self.get_menu)

        # Follow pagination, if the page has a ".next" link.
        next_href = response.css('.next').xpath('a/@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(
                response.urljoin(next_href),
                callback=self.restaurants_for_this_city,
            )

    def get_menu(self, response):
        """Yield one ``Restaurant`` item per menu entry on a restaurant page.

        Restaurant-level fields come from the request meta filled in by
        ``restaurants_for_this_city``; menu-level fields are scraped from
        each ``ul/li`` under the direct ``div`` children of
        ``div#contenu_choixplats``.
        """
        # response.meta is Scrapy's shortcut for response.request.meta.
        meta = response.meta

        # With no matching <img>, urljoin(None) would return the page URL
        # and the HTML page would be queued as an image; emit [] instead.
        image_src = response.xpath('//span/img/@src').extract_first()
        image_urls = [response.urljoin(image_src)] if image_src else []

        for menu_section in response.xpath("//div[@id = 'contenu_choixplats']/div"):
            menu_section_name = menu_section.xpath('dl/dt/text()').extract_first()
            for menu_item in menu_section.xpath('ul/li'):
                price = (
                    menu_item.xpath('div')
                    .css('div.product-price-with-offer')
                    .xpath('p/text()')
                    .extract_first()
                )
                yield Restaurant(
                    restaurant_url=response.url,
                    restaurant_name=meta['restaurant_name'],
                    street_name=meta['street_name'],
                    street_number=meta['street_number'],
                    city=meta['city'],
                    zip_code=meta['zip_code'],
                    food_type=meta['food_type'],
                    # Fresh list per item so items never share a mutable value.
                    image_urls=list(image_urls),
                    menu_category=menu_section_name,
                    # Kept as a list (extract), unlike the other text fields.
                    menu_item_title=menu_item.xpath('div/h3/text()').extract(),
                    menu_item_details=menu_item.xpath('div/p/text()').extract_first(),
                    menu_item_price=price,
                )

items.py

import scrapy

class Restaurant(scrapy.Item):
    """One menu entry together with the restaurant it belongs to."""

    # Restaurant identity and address parts.
    restaurant_url = scrapy.Field()
    street_number = scrapy.Field()
    restaurant_name = scrapy.Field()
    street_name = scrapy.Field()
    city = scrapy.Field()
    zip_code = scrapy.Field()
    food_type = scrapy.Field()

    # The menu entry itself.
    menu_category = scrapy.Field()
    menu_item_title = scrapy.Field()
    menu_item_details = scrapy.Field()
    menu_item_price = scrapy.Field()

    # Default field names used by Scrapy's ImagesPipeline: URLs to download
    # go in image_urls, download results are stored in images.
    image_urls = scrapy.Field()
    images = scrapy.Field()

settings.py

# Project identity and where Scrapy looks for / generates spiders.
BOT_NAME = 'french_scraping'
SPIDER_MODULES = [
    'french_scraping.spiders',
]
NEWSPIDER_MODULE = 'french_scraping.spiders'

# Respect each site's robots.txt rules.
ROBOTSTXT_OBEY = True

# Enable the built-in images pipeline (value is the pipeline's order) and
# tell it which directory to store downloaded images in.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/Users/drew/Desktop/frenchscraping/french_scraping'

pipelines.py

class FrenchScrapingPipeline(object):
    """Pass-through item pipeline: every item is returned unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item

1 个答案:

答案 0 :(得分:0)

get_menu(self, response) 是一个生成器函数,因为它使用了 yield 语句。

调用生成器函数时,它只会返回一个生成器对象,并不会立即开始执行函数体。要让代码真正运行,需要在 for 循环中遍历 get_menu(self, response) 的返回值,或者把它传给任何会对其进行迭代的函数或结构。