How to stop a while loop when the last page is reached

Time: 2018-08-10 17:26:26

Tags: python web-scraping

I am trying to scrape product names and their links starting from Amazon "root" links.

root1.csv (one root URL per row):
https://www.amazon.com.au/s/gp/search/ref=sr_nr_p_85_0?fst=as%3Aoff&rh=n%3A4851510051%2Cp_4%3AThe+Gro+Company%2Cp_6%3AANEGB3WVEVKZB%2Cp_85%3A5444100051&bbn=4851510051&ie=UTF8&qid=1530074038&rnid=5444099051
https://www.amazon.com.au/s/gp/search/ref=sr_nr_p_6_0?fst=as%3Aoff&rh=n%3A4851510051%2Cp_4%3AES+Kids%2Cp_85%3A5444100051%2Cp_6%3AANEGB3WVEVKZB&bbn=4851510051&ie=UTF8&qid=1530074169&rnid=4910514051

My script is below:

from io import StringIO
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import csv
import time

# Read the root URLs (one per row) from the input CSV.
data = open("input/root1.csv", encoding='utf-8', errors='ignore').read()
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)
for row in csvReader:
    myurl = row[0]
    f = open('output/product_links1.csv', 'a', newline='', encoding='utf-8')
    writer = csv.writer(f)
    # Amazon's result pages are rendered with JavaScript, so load them in
    # headless Firefox rather than with plain requests.
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Firefox(executable_path='D:/Mine/Python/geckodriver.exe',
                               options=options)
    driver.get(myurl)
    time.sleep(2)
    parser = BeautifulSoup(driver.page_source, "lxml")
    # First page: write every product's link and title to the output CSV.
    for head in parser.findAll('li', {"class": "s-result-item"}):
        for name in head.findAll('a', {"class": "s-access-detail-page"}):
            link0 = name.get("href")
            for name1 in name.findAll('h2'):
                title = name1.text
                data = [link0, title]
                print(data)
                writer.writerow(data)

    # Subsequent pages: click "Next" and scrape again -- but this loop has
    # no exit condition, so it never stops on the last page.
    while True:
        nextpage = driver.find_element_by_id("pagnNextString")
        nextpage.click()
        time.sleep(2)
        parser1 = BeautifulSoup(driver.page_source, "lxml")
        for nx_head in parser1.findAll('li', {"class": "s-result-item"}):
            for nx_name in nx_head.findAll('a', {"class": "s-access-detail-page"}):
                nx_link0 = nx_name.get("href")
                for nx_name1 in nx_name.findAll('h2'):
                    nx_title = nx_name1.text
                    data = [nx_link0, nx_title]
                    print(data)
                    writer.writerow(data)

I want the while loop to stop when it reaches the last page (where the NextPage link is disabled) and then continue crawling with the next root link from root1.csv.

Please help with this.

1 Answer:

Answer 0 (score: 0)

Add a stop flag to the while True loop.

For example:

cont = True
while cont:
  # Do a lot of stuff
  if last_page(): cont = False
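In this scraper, the flag can be driven by the state of Amazon's "Next" control. A minimal sketch using the Selenium 3 API from the question; the pagnDisabled class check is an assumption about Amazon's markup, so verify it against the live page source:

from selenium.common.exceptions import NoSuchElementException

cont = True
while cont:
    # ... scrape the current page as before ...
    try:
        nextpage = driver.find_element_by_id("pagnNextString")
        # Assumed marker for the greyed-out state on the last page.
        if "pagnDisabled" in (nextpage.get_attribute("class") or ""):
            cont = False  # "Next" is disabled -> last page reached
        else:
            nextpage.click()
            time.sleep(2)
    except NoSuchElementException:
        cont = False  # no "Next" control at all -> last page reached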

You can also use break if you would rather exit right away. That works in the pseudocode above, but if you need to do the check inside nested loops, break only exits the innermost loop: Python does not have labeled loops to allow a labeled break/continue.
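A common workaround is to move the nested loops into a function and let return act as a multi-level break; a generic sketch:

def first_negative(rows):
    # "return" exits every loop level at once, which a bare "break" cannot.
    for row in rows:
        for value in row:
            if value < 0:
                return value
    return None

print(first_negative([[3, 1], [4, -1, 5]]))  # -> -1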

In general, rather than nesting loops this deeply, I would recommend cleaning up the code and refactoring it into something like this:

nextURL = firstURL
while nextURL:
  nextURL = processNextPage(nextURL)
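
Applied to this scraper, processNextPage would scrape one results page and return the URL of the next page, or None on the last page. A sketch reusing the driver and writer from the question's script; the pagnNextLink id and the relative href are assumptions about Amazon's old pagination markup, so check the actual page:

def processNextPage(url):
    # Scrape one results page; return the next page's URL or None.
    driver.get(url)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, "lxml")
    for link in soup.findAll('a', {"class": "s-access-detail-page"}):
        h2 = link.find('h2')
        if h2:
            writer.writerow([link.get("href"), h2.text])
    # Assumed id of the "Next" anchor; on the last page the <a> is absent.
    next_link = soup.find('a', id="pagnNextLink")
    return "https://www.amazon.com.au" + next_link["href"] if next_link else None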