Python - Scrapy抓取myrecipes.com问题

时间:2016-01-22 15:25:33

标签: python scrapy web-crawler

我正在尝试抓食谱网站:我的recipes.com为了提取我将存储在Android应用程序中的sqlite db中的食谱细节。除了配方持续时间之外,我能够提取所有配方详细信息。这里的问题并非所有食谱都采用相同的格式;一些包含烹饪时间和准备时间,一些包含总时间和一些不包括。下面是我用来抓取网站的代码以及我定位的持续时间的html代码。

我试图运行代码,但输出不会注册。我怀疑问题出在if-else语句中,我需要这个语句来考虑不同的配方格式。任何帮助将不胜感激。

from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule

from myrecipes.items import MyrecipesRecipe, Ingredient, Nutrients

class MyrecipesSpider(CrawlSpider):
    name = "myrecipes" # name of the spider to be used when crawling
    allowed_domains = ["myrecipes.com"] # where the spider is allowed to go
    start_urls = ["http://www.myrecipes.com/recipe/indian-chickpea-vegetable-stew"]

def parse(self, response):
    sel = Selector(response) # the selector
    recipe = MyrecipesRecipe()

    # Name
    recipe['name'] = sel.xpath("substring-before(//title/text(),' Recipe')").extract()

    # Cuisine
    recipe['cuisine'] = "Indian"

    # Ingredients
    ingredients = []
    ingredient_nodes = sel.xpath('//*[@class = "panel-pane pane-entity-field pane-node-field-ingredients"]/div/div')

    for ingredient_node in ingredient_nodes:
        try:
            name = ingredient_node.xpath('//div[@class = "field-ingredients"]/div/div/span[@itemprop = "name"]/text()').extract()
            quantity = ingredient_node.xpath('//div[@class = "field-ingredients"]/div/div/span[@itemprop = "amount"]/text()').extract()
        except:
            continue

        ingredient = Ingredient()
        ingredient['name'] = name
        ingredient['quantity'] = quantity
        ingredients.append(ingredient)

    recipe['ingredients'] = ingredients

    # Directions
    instructions = []
    instruction_nodes = sel.xpath('//div[@itemprop = "instructions"]/div[@class = "field-instructions"]/div/div[@class = "field-item even"]')

    for instruction_node in instruction_nodes:
        try:
            instruction_step = instruction_node.xpath('//div[@itemprop = "instructions"]/div[@class = "field-instructions"]/div/div[@class = "field-item even"]/*/text()').extract()
        except:
            continue
        instructions.append(instruction_step)

    recipe['instructions'] = instructions

    # Nutritional Info
    nutrients = []
    nutrient_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-nutrition-data"]/div/div[@itemprop = "nutrition"]')

    for nutrient_node in nutrient_nodes:
        try:
            name = nutrient_node.xpath('//div[@class = "field-nutrition-data"]/div[contains (@class, "field-collection-view clearfix view-mode-recipe-nutrition")]/div/text()').extract()
            quantity = nutrient_node.xpath('//div[@class = "field-nutrition-data"]/div[contains(@class, "field-collection-view clearfix view-mode-recipe-nutrition")]/div/span/text()').extract()
        except:
            continue

        nutrient = Nutrients()
        nutrient['name'] =  name
        nutrient['quantity'] = quantity
        nutrients.append(nutrient)
    nutrient_name = []
    x = nutrients[0].get('name')
    for i in x:
        if i != "\n":
            nutrient_name.append(i)
    nutrients[0]['name'] = nutrient_name

    recipe['nutrients'] = nutrients

    # Recipe time
    duration_nodes = sel.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div[@class = "field-collection-container clearfix"]')

    for duration_node in duration_nodes:
        try:
            path = duration_node.xpath('//div[@class = "panel-pane pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = "pane-content"]/div/div[@class = "field-recipe-time"]/div/div/span[1]/text()').extract()
            if path == 'Prep: ':
                recipe['prep_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
            elif path == 'Cook: ':
                recipe['cook_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
            elif path == 'Total: ':
                recipe['total_time'] = duration_node.xpath('//div[@class = "field-recipe-time"]/div/div/span[2]/text()').extract()
        except:
            continue


    # Number of Servings
    recipe['servings'] = sel.xpath("substring-after(//div[@class = 'panel-pane pane-entity-field pane-node-field-yield']/div[@class = 'pane-content']/div[@itemprop = 'yield']/div[@class = 'field-yield']/text(), ': ')").extract()

    return recipe

HTML片段:

<div class="panel-pane pane-entity-field pane-node-field-recipe-time recipe-time">
    <h2 class="pane-title">Recipe Time</h2>

  <div class="pane-content">
        <div class="field-collection-container clearfix">
      <div class="field-recipe-time">
        <div class="field-collection-view clearfix view-mode-recipe-time">
        <div class="recipe-time-info">
          <span class="recipe-time-text">Prep: </span>
          <span class="recipe-time-duration">25 Minutes</span>
        </div>
        </div>  </div>
          <div class="field-recipe-time">
            <div class="field-collection-view clearfix view-mode-recipe-time field-collection-view-final">
        <div class="recipe-time-info">
          <span class="recipe-time-text">Cook: </span>
          <span class="recipe-time-duration">45 Minutes</span>
        </div>
        </div>  </div>
        </div>  </div>


          </div>

1 个答案:

答案 0 :(得分:-1)

问题在于你的xpath构造。更常见的是,当存在大量html元素时,丢失任何元素的可能性会增加。如果您发现构建相对xpath很困难,我建议您使用浏览器的xpath选择器。您可以右键单击元素并选择相对xpath。试试这个:

    duration_nodes = sel.xpath('//*[@id="block-system-main"]/div/div[4]/div[1]/div/div[3]/div/div/div')
    for duration_node in duration_nodes:
        try:
            path = ''.join(duration_node.xpath('div/div/span[1]/text()').extract())
            if path == 'Prep: ':
                recipe['prep_time'] = duration_node.xpath('div/div/span[2]/text()').extract()
            elif path == 'Cook: ':
                recipe['cook_time'] = duration_node.xpath('div/div/span[2]/text()').extract()
            elif path == 'Total: ':
                 recipe['total_time'] = duration_node.xpath('div/div/span[2]/text()').extract()
        except:
            continue