Scrapy: check whether a crawled URL returns any downloadable file

Asked: 2016-05-19 13:11:39

Tags: python-3.x web-scraping scrapy

I'm new to Scrapy and haven't found any help on this so far.

I want to build a small scraper that collects all the URLs on a page, visits them one by one, and, if a URL returns a downloadable file of any extension, downloads it and saves it to a specified location. Here is the code I have written:

items.py

import scrapy

class ZcrawlerItem(scrapy.Item):
    file = scrapy.Field()
    file_url = scrapy.Field()

spider.py

from scrapy import Selector
from scrapy.spiders import CrawlSpider
from scrapy.http import Request

from crawler.items import ZcrawlerItem

DOMAIN = 'example.com'
URL = 'http://%s' % DOMAIN


class MycrawlerSpider(CrawlSpider):
    name = "mycrawler"
    allowed_domains = [DOMAIN]
    start_urls = [
        URL
    ]
    def parse_dir_contents(self, response):
        # inspect the response headers to decide whether this is a downloadable file
        print(response.headers)
        item = ZcrawlerItem()
        item['file_url'] = response.url
        return item

    def parse(self, response):
        hxs = Selector(response)
        for url in hxs.xpath('//a/@href').extract():
            if (url.startswith('http://') or url.startswith('https://')):
                yield Request(url, callback=self.parse_dir_contents)
        for url in hxs.xpath('//iframe/@src').extract():
            yield Request(url, callback=self.parse_dir_contents)

The problem I'm facing is that parse_dir_contents doesn't show the headers, so it is hard to check whether the response data is a downloadable file or just page content.
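
For context, this kind of check is usually done against the response headers inside the callback. A minimal sketch (not the code from the question; the MIME-type prefixes treated as "downloadable" here are an assumption):

    def parse_dir_contents(self, response):
        # header values are bytes in Scrapy; fall back to b'' if the header is missing
        content_type = (response.headers.get('Content-Type') or b'').decode('utf-8')
        disposition = (response.headers.get('Content-Disposition') or b'').decode('utf-8')
        # an attachment disposition or a non-HTML MIME type suggests a downloadable file
        if 'attachment' in disposition or content_type.startswith(('application/', 'image/')):
            item = ZcrawlerItem()
            item['file_url'] = response.url
            return item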

BTW, I'm using Scrapy 1.1.0 and Python 3.4.

Any help would be greatly appreciated!!

1 Answer:

Answer 0 (score: 0)

So after some R&D I found the solution; here is the updated spider.py:

from urllib.parse import urljoin

from scrapy import Selector
from scrapy.spiders import CrawlSpider
from scrapy.http import Request

from crawler.items import ZcrawlerItem

DOMAIN = 'example.com'
URL = 'http://%s' % DOMAIN


class MycrawlerSpider(CrawlSpider):
    name = "mycrawler"
    allowed_domains = []  # empty: do not restrict crawling to a single domain
    allowed_mime_type = [b'application/zip', b'application/x-msdownload', b'application/pdf', b'image/jpeg', b'image/jpg',
                     b'image/png', b'application/octet-stream']
    start_urls = [
        URL
    ]
    def parse(self, response):
        hxs = Selector(response)
        for url in hxs.xpath('//a/@href').extract():
            if url.startswith('http://') or url.startswith('https://'):
                yield Request(url, callback=self.parse_item)
            elif 'javascript' not in url:
                # resolve relative links against the current page URL
                new_url = urljoin(response.url, url.strip())
                print("New url : ", new_url)
                yield Request(new_url, callback=self.parse_item)
        for url in hxs.xpath('//iframe/@src').extract():
            yield Request(url, callback=self.parse_item)

    def parse_item(self, response):
        if response.headers['Content-Type'] in self.allowed_mime_type:
            item = ZcrawlerItem()
            item['file_urls'] = response.url
            item['referer'] = response.request.headers['Referer'].decode("utf-8")
            yield item
        else:
            self.logger.info('No allowed file type found, trying the next page: %s', response.url)
            # not a downloadable file, so parse it as a normal page and follow its links
            yield Request(response.url, callback=self.parse, dont_filter=True)
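
Note that this spider and the pipeline below put file_urls, referer and path on the item, so items.py has to be extended to match. A minimal sketch with just those fields (the field names are taken from the code above, nothing else is assumed):

import scrapy


class ZcrawlerItem(scrapy.Item):
    file_urls = scrapy.Field()  # URL of the downloadable file
    referer = scrapy.Field()    # page that linked to the file
    path = scrapy.Field()       # local path filled in by the pipeline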

The output is then passed to pipeline.py, where I save the information to PostgreSQL and download the file:

import datetime
import hashlib
import os

import psycopg2

try:
    import urllib.request as urllib2
except ImportError:
    import urllib2

FILES_STORE = '<location to save files>'


class Pipeline(object):
    def __init__(self):
        self.conn = psycopg2.connect(user="postgres", password="pass",
                                     dbname="db_name",
                                     host='localhost')

    def process_item(self, item, spider):
        item['path'] = self.write_to_file(item['file_urls'])
        cur = self.conn.cursor()
        cur.execute('''
                insert into scrape ( file_url, referer,path,created_date)
                values (%s, %s,%s, %s);
                ''', [
            item['file_urls'],
            item['referer'],
            item['path'],
            datetime.datetime.now()])
        self.conn.commit()
        return item

    def write_to_file(self, url):
        # save the file under a directory named after the md5 hash of its URL
        response = urllib2.urlopen(url)
        directory = FILES_STORE + str(hashlib.md5(url.encode('utf-8')).hexdigest()) + "/"
        if not os.path.exists(directory):
            os.makedirs(directory)
        file_name = url.split('/')[-1]
        with open(directory + str(file_name), "wb") as handle:
            handle.write(response.read())
        return directory + str(file_name)
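
For the pipeline to actually run, it also has to be enabled in the project's settings.py. A minimal sketch, assuming the project package is named crawler and the class above lives in crawler/pipelines.py (adjust the dotted path to your layout):

# settings.py (assumed module path)
ITEM_PIPELINES = {
    'crawler.pipelines.Pipeline': 300,
}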

Hope this helps, cheers (y)