我已经编写了从soup.io下载图像的程序。
from bs4 import BeautifulSoup as soup
import urllib.request
import requests
import os.path
login = "test-site" #input('Enter soup login:')
website = "http://" + login + ".soup.io"
path = 'images'
if not os.path.exists(path):
os.makedirs(path)
openWebsite = soup(urllib.request.urlopen(website), 'html.parser')
imageLink = openWebsite.find_all(name="div", attrs={'class': 'imagecontainer'})
i = 1
for src in imageLink:
temp = src.find('img')['src']
img_data = requests.get(temp).content
if temp.find('.gif') != -1:
filename = os.path.join(path, str(i) + '.gif')
with open(filename, 'wb') as handler:
handler.write(img_data)
i += 1
elif temp.find('.jpeg') != -1:
filename = os.path.join(path, str(i) + '.jpeg')
with open(filename, 'wb') as handler:
handler.write(img_data)
i += 1
else:
filename = os.path.join(path, str(i) + '.png')
with open(filename, 'wb') as handler:
handler.write(img_data)
i += 1
nextPage = openWebsite.find_all(name="a", attrs={'class': 'more keephash'})
while str(nextPage):
for item in nextPage:
nextPageLink = website+item['href']
for j in nextPageLink:
openWebsite = soup(urllib.request.urlopen(nextPageLink), "html.parser")
imageLink = openWebsite.find_all(name="div", attrs={'class': 'imagecontainer'})
nextPage = openWebsite.find_all(name="a", attrs={'class': 'more keephash'})
for g in nextPage:
nextPageLink = website + g['href']
for src in imageLink:
temp = src.find('img')['src']
img_data = requests.get(temp).content
if temp.find('.gif') != -1:
filename = os.path.join(path, str(i) + '.gif')
with open(filename, 'wb') as handler:
handler.write(img_data)
i += 1
elif temp.find('.jpeg') != -1:
filename = os.path.join(path, str(i) + '.jpeg')
with open(filename, 'wb') as handler:
handler.write(img_data)
i += 1
else:
filename = os.path.join(path, str(i) + '.png')
with open(filename, 'wb') as handler:
handler.write(img_data)
i += 1
每页显示20张图片。在每一页我都在刮擦"更多"链接到旧页面(nextPageLink),并在每个图像保存在循环中后打开它。我的问题是我的程序循环在最后一页(哪里不是"更多"链接)并从那里一遍又一遍地下载图像。我尝试将nextPageLink分配给一个名为previousPage的新变量,然后使用if语句进行比较 - 如果链接相同,我想设置nextPage = False,但它不起作用 - nextPageLink不再更新,因为网站上没有链接,所以我无法正确比较它。
答案 0 :(得分:1)
正如@Marc B所说,我的问题是我没有检查nextPage是否为空。所以解决方案很简单:
if openWebsite.find_all(name="a", attrs={'class': 'more keephash'}) == []:
break