scrapy - 下载图像而不压缩图片

时间:2017-12-03 09:16:59

标签: python scrapy scrapy-pipeline

我试图在不压缩的情况下下载一些图像,例如 http://p1.pstatp.com/origin/433c000159def0223671 。
这张照片原图约为 2.0 MB,但当我使用 scrapy 下载它时,得到的文件只有 120 KB。

settings.py

# Scrapy project settings for the "toutiao" image crawler.
BOT_NAME = 'toutiao'
SPIDER_MODULES = ['toutiao.spiders']
NEWSPIDER_MODULE = 'toutiao.spiders'

# Directory (relative to the working directory) where the image pipeline
# stores downloaded files.
IMAGES_STORE = './images/'
# Let media-pipeline downloads follow HTTP redirects.
MEDIA_ALLOW_REDIRECTS = True
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # The original value had "X-Requested-With:XMLHttpRequest" fused onto the
    # end of the User-Agent string (and no space before "AppleWebKit"), so the
    # server received a malformed UA and never saw the intended header.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

ITEM_PIPELINES = {'toutiao.pipelines.ToutiaoPipeline': 300}

items.py

import scrapy
class ToutiaoItem(scrapy.Item):
    """One scraped article page together with the image URLs found on it."""

    # Search keyword the spider was run with; used as the output sub-directory.
    keyword = scrapy.Field()
    # Text of the article page's <title>; used as the image file-name prefix.
    title = scrapy.Field()
    # List of image URLs extracted from the page, downloaded by the pipeline.
    urls = scrapy.Field()

spiders.py

import scrapy
from scrapy import Request
from toutiao.items import ToutiaoItem
from urllib.parse import  urlencode
import json
import re

class ToutiaopicSpider(scrapy.Spider):
    """Search Toutiao for ``keyword`` and yield one item per article page.

    ``parse`` reads the JSON search results and schedules every article URL;
    ``find_pic`` extracts the image URLs from each article and emits a
    ``ToutiaoItem`` for the image pipeline.
    """

    name = 'toutiaopic'
    allowed_domains = ['toutiao.com']

    keyword = '佳片欣赏·人像'
    # Query parameters of the first search page. Treated as read-only: the
    # dict is a class attribute shared by every spider instance, so paging
    # state is carried on the requests instead of being written back here.
    param = {'offset': 0,
             'format': 'json',
             'keyword': keyword,
             'autoload': 'true',
             'count': '20',
             'cur_tab': '1'}

    search_url = 'https://www.toutiao.com/search_content/?'
    url = search_url + urlencode(param)
    start_urls = [url,]

    # Compiled once at class creation instead of on every find_pic call.
    gallery_pattern = re.compile('"url_list".*?"url":"(.*?)"},', re.S)
    quoted_url_pattern = re.compile(r'&quot;(http.*?)&quot;', re.S)

    def parse(self, response):
        """Yield a request per article in the search result, then the next page.

        The offset of the page being parsed travels in ``response.meta``
        (falling back to ``param['offset']`` for the start URL). Paging stops
        once the offset reaches 20, i.e. two result pages are fetched.
        """
        if response.status == 200:
            data = json.loads(response.body.decode('utf-8'))
            # "data" may be missing or null in the payload; treat both as empty.
            for entry in data.get('data') or []:
                article_url = entry.get('article_url')
                if article_url:
                    yield Request(article_url, callback=self.find_pic)

        # get more
        offset = response.meta.get('offset', self.param['offset'])
        if offset < 20:
            next_offset = offset + 20
            query = dict(self.param, offset=next_offset)
            yield Request(self.search_url + urlencode(query),
                          callback=self.parse,
                          meta={'offset': next_offset})

    def find_pic(self, response):
        """Extract the page title and image URLs and yield a ``ToutiaoItem``."""
        # Pages without a <title> get an empty title instead of an IndexError.
        title = response.xpath('//title/text()').extract_first(default='')
        # Backslashes are stripped so escaped URLs inside the embedded
        # JSON/JS become plain URLs the patterns below can match.
        html = response.body.decode('utf-8').replace('\\', '')

        if 'gallery: JSON.parse' in html:
            urls = self.gallery_pattern.findall(html)
        else:
            urls = self.quoted_url_pattern.findall(html)

        item = ToutiaoItem()
        item['keyword'] = self.keyword
        item['urls'] = urls
        item['title'] = title
        yield item

pipelines.py

from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
from scrapy.exceptions import DropItem
import re

class ToutiaoPipeline(ImagesPipeline):
    """Download every URL in ``item['urls']`` to ``<keyword>/<title><n>.png``.

    The naming data (keyword, title, position of the URL in the item) is
    attached to each request via ``Request.meta``. A single pipeline instance
    handles all items and downloads run concurrently, so keeping the "current"
    item or a running counter on ``self`` lets one item's files be named with
    another item's title or overwrite each other.

    NOTE(review): per the Scrapy documentation, ``ImagesPipeline`` re-encodes
    downloaded images to JPEG, so the stored bytes are not the original file
    and the ``.png`` suffix does not reflect the stored format. The suffix is
    kept unchanged here so existing output paths stay the same; storing the
    untouched original would need ``FilesPipeline`` (and ``FILES_STORE``) —
    confirm before changing.
    """

    # Characters removed from path components (Windows-reserved file-name
    # characters, plus the curly quote the original pattern listed).
    _unsafe_chars = re.compile(r'[?\\*|“"<>:/]')

    def get_media_requests(self, item, info):
        """Yield one download request per image URL, tagged with naming data."""
        for index, url in enumerate(item['urls']):
            yield Request(url=url, meta={
                'keyword': item['keyword'],
                'title': item['title'],
                'index': index,
            })

    def item_completed(self, results, item, info):
        """Return ``item`` if at least one image was stored, else drop it.

        Raises:
            DropItem: when none of the item's downloads succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for ``request``, relative to IMAGES_STORE.

        The path depends only on ``request.meta``, so repeated calls for the
        same request always return the same path. ``item`` is accepted for
        compatibility with Scrapy versions that pass it, but is not used.
        """
        meta = request.meta
        keyword = self._unsafe_chars.sub('', meta['keyword'])
        title = self._unsafe_chars.sub('', meta['title'])
        image_name = title + str(meta['index'])
        return '%s/%s.png' % (keyword, image_name)

我想下载未经压缩的原始图片。我该怎么办?

0 个答案:

没有答案