使用Scrapy抓取图片失败

时间:2013-05-26 16:34:43

标签: python web-scraping scrapy

我正在尝试使用以下scrapy代码从网站抓取图像:

import urlparse
from PIL import Image
from scrapy.exceptions import DropItem, NotConfigured, IgnoreRequest
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request

from scrapy.contrib.pipeline.images import ImagesPipeline
from mobile.items import Website

class MobileSpider(CrawlSpider):
    """Spider that scrapes product data from mobile-store.ro.

    For every response handled by ``parse`` it schedules a request for the
    first product link found on the page and yields one ``Website`` item per
    matched content container.
    """

    name = "mobile"
    allowed_domains = ["mobile-store.ro"]
    start_urls = ["http://www.mobile-store.ro/produse/"]
    # The patterns use ``\d+`` (one or more digits); the previous ``d+`` only
    # matched literal "d" characters.
    # NOTE(review): 'parse_item' is not defined on this class, and the Scrapy
    # CrawlSpider documentation warns against overriding parse() because the
    # rule machinery is built on it -- confirm whether these rules are meant
    # to be active.
    rules = (
        Rule(SgmlLinkExtractor(allow=r"/produs/\d+"), follow=True),
        Rule(SgmlLinkExtractor(allow=r"/produse/\d+"), callback='parse_item'),
    )

    def parse(self, response, response2=None):
        """Extract product items from ``response`` and follow the next link.

        :param response: the downloaded page.
        :param response2: unused; optional so that the callback can be
            invoked with the response alone, while callers that still pass a
            second argument keep working.
        :returns: a generator yielding at most one follow-up ``Request`` and
            then one ``Website`` item per matched content container.
        """
        hxs = HtmlXPathSelector(response)

        next_page = hxs.select("//ul[@class='products']/li/a/@href").extract()
        if next_page:
            # urljoin() leaves an absolute href untouched and resolves a
            # relative one against the current page.
            yield Request(urlparse.urljoin(response.url, next_page[0]), self.parse)

        for site in hxs.select('//div[@id="wrapper"]/div[@id="content"]'):
            item = Website()
            item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
            item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
            item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
            item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
            # extract() returns a list of strings, so every src is joined
            # individually instead of handing the whole list to urljoin().
            # NOTE(review): this gallery markup is reportedly filled in by
            # JavaScript, in which case the XPath matches nothing in the raw
            # HTML; the links may instead live under ul.ad-thumb-list --
            # confirm against response.body.
            image_srcs = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
            item['image_urls'] = [urlparse.urljoin(response.url, src) for src in image_srcs]
            item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
            item['url'] = response.url
            yield item

settings.py:
# Scrapy settings for the "mobile" project.

# The spider package is named once and reused for both spider settings.
NEWSPIDER_MODULE = 'mobile.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

DEFAULT_ITEM_CLASS = 'mobile.items.Website'

# NOTE(review): only the stock ImagesPipeline is listed here; the
# MyImagesPipeline subclass shown in pipelines.py is not registered --
# confirm which one is intended.
ITEM_PIPELINES = [
    'scrapy.contrib.pipeline.images.ImagesPipeline',
]

items.py:
from scrapy.item import Item, Field

class Website(Item):
    """Item holding the data scraped for one product page."""

    nume = Field()         # text of the product title heading
    descriere = Field()    # paragraph texts from the description tab
    categorie = Field()    # link texts from the "posted_in" span
    brand = Field()        # link texts from the "tagged_as" span
    pret = Field()         # text of the price amount span
    url = Field()          # URL of the response the item was scraped from
    image_urls = Field()   # list of image URLs requested by the images pipeline
    images = Field()       # presumably filled in by ImagesPipeline -- TODO confirm
    image_paths = Field()  # paths of downloaded images, set by MyImagesPipeline.item_completed

pipelines.py:
# ImagesPipeline is imported from scrapy.contrib, the same path the spider
# module and ITEM_PIPELINES use; the previous "mobile.contrib" path did not
# match them.
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request


class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that records download paths and drops image-less items."""

    def get_media_requests(self, item, info):
        """Yield one download ``Request`` per URL in ``item['image_urls']``.

        :param item: the scraped item; must contain an ``image_urls`` list.
        :param info: pipeline bookkeeping object (unused here).
        """
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        """Store the paths of the successful downloads on the item.

        :param results: iterable of ``(ok, value)`` pairs; for pairs where
            ``ok`` is true, ``value`` is indexed with ``'path'``.
        :param item: the item whose images were requested.
        :param info: pipeline bookkeeping object (unused here).
        :returns: the item with ``image_paths`` set.
        :raises DropItem: if no download succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

当我尝试使用以下代码获取图片网址时出现问题:

for site in sites:
        item = Website()
        item['nume'] = site.select('//div[@class="summary"]/h1[@class="product_title entry-title"]/text()').extract()
        item['categorie'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="posted_in"]/a/text()').extract()
        item['brand'] = site.select('//div[@class="summary"]/div[@class="product_meta"]/span[@class="tagged_as"]/a/text()').extract()
        item['descriere'] = site.select('//div[@class="woocommerce_tabs"]/div[@id="tab-description"]/p/text()').extract()
        image_relative_url = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
        item['image_urls'] = [urlparse.urljoin(response.url2,image_relative_url)]
        #item['image_urls'] = site.select('//div[@class="ad-image-wrapper"]/div[@class="ad-image"]/img[@class="lightbox"]/@src').extract()
        item['pret'] = site.select('//div[@class="summary"]/div[1]/p[@class="price"]/span[@class="amount"]/text()').extract()
        item['url'] = response.url
        items.append(item)
    for item in items:
        yield item

这段代码返回的是页面网址,而不是图片网址。其他所有字段都能正确抓取。请问应该如何解决这个问题并正确获取图片网址?

1 个答案:

答案 0 :(得分:0)

这是因为图像(以及ad-image-wrapper div的全部内容)是通过javascript动态填充的。

在parse方法中转储response.body帮助我弄清楚:实际的图片链接最初保存在ad-thumb-list列表中。因此,请尝试使用以下方法获取图片网址:

# Read the link from the first entry of the thumbnail list.
image_relative_url = site.select('//ul[@class="ad-thumb-list"]/li[@class="first_item"]/a/@href').extract()
# extract() returns a list: a non-empty result collapses to its first href,
# an empty result is left as the empty list.
image_relative_url = image_relative_url[0] if image_relative_url else image_relative_url

希望这就是你所需要的。