I want to download images after scraping several pages. However, not all of the images end up saved, because they are overwritten inside the for loop.
Below is my code. What is wrong with it?
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as rq

for page in range(2, 4):
    baseUrl = 'https://onepiecetreasurecruise.fr/Artwork/index.php?page=index'
    plusUrl = baseUrl + str(page)
    html = urlopen(plusUrl).read()
    soup = BeautifulSoup(html, 'html.parser')

    img = soup.find_all(class_='card-img-top')

    listimg = []
    for i in img:
        listimg.append(i['src'])

    n = 1
    for index, img_link in enumerate(listimg):
        img_data = rq.get(img_link).content
        with open('./onepiece/' + str(index + 1) + '.png', 'wb+') as f:
            f.write(img_data)
        n += 1
Answer 0 (score: 0)
I fixed the indentation in your code. This works for me and downloads 30 images.
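The corrected code itself is not preserved in this copy of the answer. A minimal sketch of what the re-indented version likely looks like, assuming the key change is a single running counter shared by all pages so that images from page 3 do not overwrite those from page 2 (the original code restarts its index at 0 for every page):

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as rq

baseUrl = 'https://onepiecetreasurecruise.fr/Artwork/index.php?page=index'

n = 1  # running counter across all pages (assumed fix, not the poster's exact code)
for page in range(2, 4):
    html = urlopen(baseUrl + str(page)).read()
    soup = BeautifulSoup(html, 'html.parser')

    # collect every image tagged with the card-img-top class on this page
    for img in soup.find_all(class_='card-img-top'):
        img_data = rq.get(img['src']).content
        # name files with the running counter so each page gets fresh filenames
        with open('./onepiece/' + str(n) + '.png', 'wb') as f:
            f.write(img_data)
        n += 1

With two pages of 15 artworks each, this would account for the 30 downloaded images the answer mentions; the ./onepiece/ directory must exist beforehand.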
Answer 1 (score: 0)
Here is another way to download all of the images.
from simplified_scrapy import Spider, SimplifiedDoc, utils, SimplifiedMain

class ImageSpider(Spider):
    name = 'onepiecetreasurecruise'
    start_urls = ['https://onepiecetreasurecruise.fr/Artwork/index.php?page=index']
    # refresh_urls = True
    concurrencyPer1s = 0.5  # set download speed
    imgPath = 'images/'

    def __init__(self):
        Spider.__init__(self, self.name)  # necessary
        utils.createDir(self.imgPath)  # create image dir

    def afterResponse(self, response, url, error=None, extra=None):
        try:  # save images
            flag = utils.saveResponseAsFile(response, self.imgPath, 'image')
            if flag: return None
        except Exception as err:
            print(err)
        return Spider.afterResponse(self, response, url, error, extra)

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        # image urls
        urls = doc.body.getElements('p', value='card-text').a
        if (urls):
            for u in urls:
                u['header'] = {'Referer': url['url']}
            self.saveUrl(urls)
        # next page urls
        u = doc.body.getElementByText('Suivant', tag='a')
        if (u):
            u['href'] = utils.absoluteUrl(url.url, u.href)
            self.saveUrl(u)
        return True

SimplifiedMain.startThread(ImageSpider())  # start download
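Note that simplified_scrapy is a third-party package rather than part of the standard library; assuming it is the package of the same name on PyPI, it would need to be installed first (for example with pip install simplified_scrapy) before running the spider, and the downloaded images are written into the images/ directory the spider creates.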