Question

我已经编写了从soup.io下载图像的程序。

from bs4 import BeautifulSoup as soup
import urllib.request
import requests
import os.path

login = "test-site" #input('Enter soup login:')
website = "http://" + login + ".soup.io"
path = 'images'

if not os.path.exists(path):
    os.makedirs(path)

openWebsite = soup(urllib.request.urlopen(website), 'html.parser')
imageLink = openWebsite.find_all(name="div", attrs={'class': 'imagecontainer'})

i = 1
for src in imageLink:
    temp = src.find('img')['src']
    img_data = requests.get(temp).content
    if temp.find('.gif') != -1:
        filename = os.path.join(path, str(i) + '.gif')
        with open(filename, 'wb') as handler:
            handler.write(img_data)
        i += 1
    elif temp.find('.jpeg') != -1:
        filename = os.path.join(path, str(i) + '.jpeg')
        with open(filename, 'wb') as handler:
            handler.write(img_data)
        i += 1
    else:
        filename = os.path.join(path, str(i) + '.png')
        with open(filename, 'wb') as handler:
            handler.write(img_data)
        i += 1

nextPage = openWebsite.find_all(name="a", attrs={'class': 'more keephash'})

while str(nextPage):
    for item in nextPage:
        nextPageLink = website+item['href']

        for j in nextPageLink:
            openWebsite = soup(urllib.request.urlopen(nextPageLink), "html.parser")
            imageLink = openWebsite.find_all(name="div", attrs={'class': 'imagecontainer'})
            nextPage = openWebsite.find_all(name="a", attrs={'class': 'more keephash'})

            for g in nextPage:
                nextPageLink = website + g['href']

            for src in imageLink:
                temp = src.find('img')['src']
                img_data = requests.get(temp).content
                if temp.find('.gif') != -1:
                    filename = os.path.join(path, str(i) + '.gif')
                    with open(filename, 'wb') as handler:
                        handler.write(img_data)
                    i += 1
                elif temp.find('.jpeg') != -1:
                    filename = os.path.join(path, str(i) + '.jpeg')
                    with open(filename, 'wb') as handler:
                        handler.write(img_data)
                    i += 1
                else:
                    filename = os.path.join(path, str(i) + '.png')
                    with open(filename, 'wb') as handler:
                        handler.write(img_data)
                    i += 1

每页显示20张图片。在每一页我都在刮擦＆＃34;更多＆＃34;链接到旧页面（nextPageLink），并在每个图像保存在循环中后打开它。我的问题是我的程序循环在最后一页（哪里不是＆＃34;更多＆＃34;链接）并从那里一遍又一遍地下载图像。我尝试将nextPageLink分配给一个名为previousPage的新变量，然后使用if语句进行比较 - 如果链接相同，我想设置nextPage = False，但它不起作用 - nextPageLink不再更新，因为网站上没有链接，所以我无法正确比较它。

Answer 1

正如@Marc B所说，我的问题是我没有检查nextPage是否为空。所以解决方案很简单：

if openWebsite.find_all(name="a", attrs={'class': 'more keephash'}) == []:
   break

退出无限循环

1 个答案: