Traceback (most recent call last):
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/usr/lib/pymodules/python2.7/scrapy/pipelines/media.py", line 44, in process_item
    requests = arg_to_iter(self.get_media_requests(item, info))
  File "/usr/lib/pymodules/python2.7/scrapy/pipelines/images.py", line 109, in get_media_requests
    return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]
  File "/usr/lib/pymodules/python2.7/scrapy/http/request/__init__.py", line 24, in __init__
    self._set_url(url)
  File "/usr/lib/pymodules/python2.7/scrapy/http/request/__init__.py", line 59, in _set_url
    raise ValueError('Missing scheme in request url: %s' % self._url)
I want to get the subject_url for every page, and then get the image_url for every subject. That is, I want to download every image from every subject; there are many pages, and many subjects on each page. I have checked that every URL is OK. My Scrapy code is here:
class CaoliuSpider(scrapy.spiders.Spider):
    name = 'Caoliu'
    allowed_dimains = ['bearhk.pw']
    start_urls = []
    base_url = "http://www.examples.com/"
    for i in range(20):
        url = "http://www.examples.com?&page=" + str(i+1)
        start_urls.append(url)

    def parse(self, response):
        selector = Selector(response)
        urls = [urlparse.urljoin(self.base_url, str(i)) for i in Selector(response=response).xpath('//*[@class="tr3 t_one"]/td/h3/a/@href').extract()]
        for url in urls:
            yield Request(url, callback=self.saveimg)

    def saveimg(self, response):
        selector = Selector(response)
        imgs_urls = Selector(response=response).xpath('//*/input[@type="image"]/@src').extract()
        item = CaoliuItem()
        for url in imgs_urls:
            item['image_urls'] = str(url)
            item['images'] = url
            print item['image_urls']
            yield item
Answer 0 (score: 0)

Use urljoin() when you put the url into the image_urls field:

item['image_urls'] = urlparse.urljoin(self.base_url, url)

Also, allowed_dimains should be allowed_domains.
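For reference, a quick standalone illustration of how urlparse.urljoin() behaves in Python 2 (the URLs below are made up for the example):

import urlparse

base = "http://www.examples.com/"
# A relative path is resolved against the base URL, so the result has a scheme.
print urlparse.urljoin(base, "images/foo.jpg")                 # http://www.examples.com/images/foo.jpg
# An already-absolute URL passes through unchanged.
print urlparse.urljoin(base, "http://cdn.examples.com/a.jpg")  # http://cdn.examples.com/a.jpg

A bare relative src such as "images/foo.jpg" has no scheme, which is exactly what the ValueError in the traceback is complaining about.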
Answer 1 (score: 0)

image_urls should be a list, not a str.

Example from the Scrapy book:

image_paths = [x['path'] for ok, x in results if ok]
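Putting both answers together, here is a minimal, untested sketch of saveimg that yields one item per page, with image_urls as a list of absolute URLs (field names are taken from the question; the images field is left for the images pipeline to fill in):

def saveimg(self, response):
    item = CaoliuItem()
    # Build a list of absolute URLs so every entry has a scheme;
    # the pipeline iterates this list and builds one Request per URL.
    item['image_urls'] = [urlparse.urljoin(self.base_url, src)
                          for src in response.xpath('//*/input[@type="image"]/@src').extract()]
    yield item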