I am trying to do the following: crawl JPG images from additional pages reached through the href of <a> tags.
Here is the code:
from spiderStudy.items import Images
import scrapy


class MySpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/"
    ]

    def parse(self, response):
        for sel in response.css('a'):
            c = sel.xpath('@href').extract_first()
            if (c is not None) and ('void' not in c) and (c not in self.start_urls):
                # parse image
                imgs = response.css('img')
                for img in imgs:
                    d = img.xpath("@src").extract_first()
                    if d and d.endswith('.jpg'):
                        d = response.urljoin(d)
                        yield {'image_urls': [d]}
                # more urls
                url = response.urljoin(c)
                yield scrapy.Request(url, callback=self.parse)
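The spider imports Images from spiderStudy.items but then yields plain dicts, so the item class is never actually used. For reference, here is a minimal sketch of what such an item could look like; the field names image_urls and image_paths are assumptions taken from the dict keys used above and in the pipeline below, not from the real project file:

# items.py (sketch; field names are assumptions)
import scrapy

class Images(scrapy.Item):
    image_urls = scrapy.Field()   # URLs of images to be downloaded
    image_paths = scrapy.Field()  # filled in by the pipeline after download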
And this is the pipeline (pipelines.py):

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class SpiderstudyPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
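For completeness, a custom ImagesPipeline only runs if it is enabled in settings.py together with a storage directory. A minimal sketch, assuming the project module is spiderStudy and the pipeline lives in spiderStudy/pipelines.py:

# settings.py (sketch; the dotted path and directory are assumptions)
ITEM_PIPELINES = {
    'spiderStudy.pipelines.SpiderstudyPipeline': 1,
}
IMAGES_STORE = 'images'  # local directory where the downloaded JPGs are saved

Note that Scrapy's ImagesPipeline also requires Pillow to be installed.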
The crawler does fetch all the JPG images from every page, but then it gets stuck and the program does not stop properly.
I don't know what is causing it to hang; maybe it is because I used multiple 'yield's?
Please help me.