I'm new to Scrapy and haven't found any help so far.
I want to build a small scraper that collects all the URLs on a page and then visits them one by one; if a URL returns a downloadable file of any extension, it should download it and save it to a specified location. Here is the code I have written:
items.py
import scrapy


class ZcrawlerItem(scrapy.Item):
    file = scrapy.Field()
    file_url = scrapy.Field()
spider.py
from scrapy import Selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request

from crawler.items import ZcrawlerItem

DOMAIN = 'example.com'
URL = 'http://%s' % DOMAIN


class MycrawlerSpider(CrawlSpider):
    name = "mycrawler"
    allowed_domains = [DOMAIN]
    start_urls = [
        URL
    ]

    def parse_dir_contents(self, response):
        print(response.headers)
        item = ZcrawlerItem()
        item['file_url'] = response.url
        return item

    def parse(self, response):
        hxs = Selector(response)
        # follow absolute links found on the page
        for url in hxs.xpath('//a/@href').extract():
            if url.startswith('http://') or url.startswith('https://'):
                yield Request(url, callback=self.parse_dir_contents)
        # follow embedded iframes as well
        for url in hxs.xpath('//iframe/@src').extract():
            yield Request(url, callback=self.parse_dir_contents)
The problem I'm facing is that parse_dir_contents does not show the headers, so it is hard to tell whether the response is a downloadable file or just page content.
BTW, I'm using Scrapy 1.1.0 and Python 3.4.
Any help would be much appreciated!!
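(For reference, a single header can be read straight from Scrapy's Response.headers mapping inside any callback; a minimal sketch of such a check, not part of the original code:)

def parse_dir_contents(self, response):
    # Response.headers is a case-insensitive mapping of raw header bytes
    content_type = response.headers.get('Content-Type')
    self.logger.info('%s returned Content-Type %s', response.url, content_type)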
Answer 0 (score: 0)
So after some R&D I found the solution; here is the updated spider.py:
from urllib.parse import urljoin

from scrapy import Selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request

from crawler.items import ZcrawlerItem

DOMAIN = 'example.com'
URL = 'http://%s' % DOMAIN


class MycrawlerSpider(CrawlSpider):
    name = "mycrawler"
    allowed_domains = ''
    allowed_mime_type = [b'application/zip', b'application/x-msdownload', b'application/pdf',
                         b'image/jpeg', b'image/jpg', b'image/png', b'application/octet-stream']
    start_urls = [
        URL
    ]

    def parse(self, response):
        hxs = Selector(response)
        # absolute links can be followed directly
        for url in hxs.xpath('//a/@href').extract():
            if url.startswith('http://') or url.startswith('https://'):
                yield Request(url, callback=self.parse_item)
            # relative links are resolved against the current page
            elif 'javascript' not in url:
                new_url = urljoin(response.url, url.strip())
                print("New url : ", new_url)
                yield Request(new_url, callback=self.parse_item)
        # embedded iframes are followed as well
        for url in hxs.xpath('//iframe/@src').extract():
            yield Request(url, callback=self.parse_item)

    def parse_item(self, response):
        # a downloadable file is recognised by its Content-Type header
        if response.headers['Content-Type'] in self.allowed_mime_type:
            item = ZcrawlerItem()
            item['file_urls'] = response.url
            item['referer'] = str(response.request.headers['referer'].decode("utf-8"))
            yield item
        else:
            self.logger.info('Not found any allowed file type, lets try next page : %s', response.url)
            yield Request(response.url, callback=self.parse, dont_filter=True)
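The updated spider and the pipeline below reference item fields (file_urls, referer, path) that the items.py from the question does not declare, so the item class has to be adjusted accordingly; a minimal sketch, with the field names taken from the code above and everything else assumed:

import scrapy


class ZcrawlerItem(scrapy.Item):
    file_urls = scrapy.Field()  # URL of the candidate file, set in parse_item()
    referer = scrapy.Field()    # page that linked to the file, set in parse_item()
    path = scrapy.Field()       # local path of the downloaded file, filled in by the pipeline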
The output is then passed to pipeline.py, where I save the information in PostgreSQL and download the file:
import json
import scrapy
import psycopg2
import datetime
import hashlib
import os

try:
    import urllib.request as urllib2
except ImportError:
    import urllib2

FILES_STORE = '<location to save files>'


class Pipeline(object):
    def __init__(self):
        self.conn = psycopg2.connect(user="postgres", password="pass",
                                     dbname="db_name",
                                     host='localhost')

    def process_item(self, item, spider):
        # download the file first, then record its metadata in PostgreSQL
        item['path'] = self.write_to_file(item['file_urls'])
        cur = self.conn.cursor()
        cur.execute('''
            insert into scrape (file_url, referer, path, created_date)
            values (%s, %s, %s, %s);
        ''', [
            item['file_urls'],
            item['referer'],
            item['path'],
            datetime.datetime.now()])
        self.conn.commit()
        return item

    def write_to_file(self, url):
        # store each file in a directory named after the MD5 hash of its URL
        response = urllib2.urlopen(url)
        directory = FILES_STORE + str(hashlib.md5(url.encode('utf-8')).hexdigest()) + "/"
        if not os.path.exists(directory):
            os.makedirs(directory)
        file_name = url.split('/')[-1]
        with open(directory + str(file_name), "wb") as handle:
            handle.write(response.read())
        return directory + str(file_name)
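For the pipeline to be invoked at all it has to be enabled in the project's settings.py; assuming the project package is called crawler and the class above lives in crawler/pipelines.py (both assumptions), the entry would look roughly like:

ITEM_PIPELINES = {
    'crawler.pipelines.Pipeline': 300,  # hypothetical module path; adjust to the actual project layout
}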
Hope this helps, cheers (y)