我正在使用Scrapy从http://www.vesselfinder.com/vessels
下载图片但是,我只能获得像http://www.vesselfinder.com/vessels/ship-photo/0-227349190-7c01e2b3a7a5078ea94fff9a0f862f8a/0
这样的图像的相对网址所有图片都名为0.jpg,但如果我尝试使用该绝对网址,则无法访问该图片。
我的代码: 的 items.py
import scrapy
class VesselItem(scrapy.Item):
name = scrapy.Field()
nationality = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
pipelines.py
import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
class VesselPipeline(object):
def get_media_requests(self, item, info):
for image_url in item['image_urls']:
yield scrapy.Request(image_url)
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
item['image_paths'] = image_paths
return item
vessel_spider.py
import scrapy
import string
from vessel.items import VesselItem
class VesselSpider(scrapy.Spider):
"""docstring for VesselSpider"""
name = "vessel"
allowed_domains = ["vesselfinder.com"]
page_name = "http://vesselfinder.com"
start_urls = [
# "http://vesselfinder.com/vessels?page=%d" %i for i in range(0,1000)
"http://vesselfinder.com/vessels"
]
def parse(self, response):
f = open('vessels.txt', 'a')
count = 0;
for sel in response.xpath('//div[@class="items"]/article'):
item = VesselItem()
imageStr = str(sel.xpath('div[1]/a/picture/img/@src').extract())
item['image_urls'] = self.page_name + imageStr[3:-2]
nameStr = str(sel.xpath('div[2]/header/h1/a/text()').extract())
item['name'] = nameStr[19:-8]
typeStr = str(sel.xpath('div[2]/div[2]/div[2]/text()').extract())
item['type'] = typeStr[3:-2]
return item
当我运行这个蜘蛛时,我收到了exceptions.ValueError: Missing scheme in request url: h
错误,因为我没有提供绝对网址。
[vessel] ERROR: Error processing {'image_urls': 'http://vesselfinder.com/vessels/ship-photo/0-224138470-a2fdc783d05a019d00ad9db0cef322f7/0.jpg',
'name': 'XILGARO ALEANTE',
'type': 'Sailing vessel'}
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
d.callback(input)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 383, in callback
self._startRunCallbacks(result)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 491, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 578, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/pipeline/media.py", line 40, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/pipeline/images.py", line 104, in get_media_requests
return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]
File "/usr/local/lib/python2.7/dist-packages/scrapy/http/request/__init__.py", line 26, in __init__
self._set_url(url)
File "/usr/local/lib/python2.7/dist-packages/scrapy/http/request/__init__.py", line 61, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
exceptions.ValueError: Missing scheme in request url: h
我该如何解决这个问题。是否有任何特殊的方式来获取像这样的网站的图像(或其绝对网址)。
答案 0 :(得分:1)
将图片网址包装在如下列表中:
item['image_urls'] = [self.page_name + imageStr[3:-2]]
答案 1 :(得分:0)
我认为以下代码可以解决问题(对代码的更改很少),
<强> vessel_spider.py 强>
class VesselSpider(scrapy.Spider):
"""docstring for VesselSpider"""
name = "vessel"
allowed_domains = ["vesselfinder.com"]
page_name = "http://vesselfinder.com"
start_urls = [
# "http://vesselfinder.com/vessels?page=%d" %i for i in range(0,1000)
"http://vesselfinder.com/vessels"
]
def parse(self, response):
f = open('vessels.txt', 'a')
count = 0;
for sel in response.xpath('//div[@class="items"]/article'):
item = VesselItem()
imageStr = sel.xpath('./div[@class="small-12 medium-5 large-5 columns"]/a/picture/img/@src').extract()
imageStr = imageStr[0] if imageStr else 'N/A'
item['image_urls'] = [self.page_name + imageStr]
nameStr = sel.xpath('./div/header/h1[@class="subheader"]/a/text()').extract()
nameStr = ' '.join(' '.join(nameStr).split()) if nameStr else 'N/A'
item['name'] = nameStr
typeStr = sel.xpath('.//div[@class="small-4 columns" and contains(text(), "Ship type")]/following-sibling::div/text()').extract()
typeStr = typeStr[0].strip() if typeStr else 'N/A'
item['ship_type'] = typeStr
yield item
<强> items.py 强>
class VesselItem(scrapy.Item):
name = scrapy.Field()
nationality = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
ship_type = scrapy.Field()
附加示例输出
{'image_urls': u'http://vesselfinder.com/vessels/ship-photo/0-227349190-7c01e2b3a7a5078ea94fff9a0f862f8a/0',
'name': u'IBTISAM ATAO',
'ship_type': u'Sailing vessel'}