我正在尝试使用Selenium和BeautifulSoup对具有“加载更多”按钮的网站进行网络抓取。我已经让脚本成功单击“加载更多”按钮并加载了其余内容,但是在将内容抓取到json文件时遇到了问题。这是我的脚本:
# Scrape Smarthistory course items: click the "load more" button in a
# Selenium-driven Chrome until it disappears, parse the final DOM with
# BeautifulSoup, and dump the collected items to a JSON file.
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import json  # was missing: json.dump below raised NameError
import time

url = "https://smarthistory.org/americas-before-1900/"
driver = webdriver.Chrome('/Users/rawlins/Downloads/chromedriver')
driver.get(url)

page_num = 0
# Keep clicking "load more" while the button is still present in the DOM
# (find_elements returns an empty, falsy list once it is gone).
while driver.find_elements_by_css_selector('#load-more-cc-objects'):
    driver.find_element_by_css_selector('#load-more-cc-objects').click()
    page_num += 1
    print("getting page number " + str(page_num))
    time.sleep(1)  # give the newly requested content a moment to load

data = []
# Parse HTML of the fully loaded page.
page_soup = soup(driver.page_source, 'lxml')
containers = page_soup.findAll(
    "div", {"class": "mb-8 hover-zoom tablescraper-selected-row opacity-100"})
for container in containers:
    item = {}
    item['type'] = "Course Material"
    item['title'] = container.find(
        'h5',
        {'class': 'm-0 mt-4 text-grey-darker text-normal leading-tight hover-connect'}
    ).text.strip()
    item['link'] = container.a["href"]
    item['source'] = "Smarthistory"
    item['base_url'] = "https://smarthistory.org"
    item['license'] = "Attribution-NonCommercial-ShareAlike"
    data.append(item)  # add the item to the list

with open("smarthistory-2.json", "w") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)

# Was browser.quit(): `browser` was never defined; the driver is `driver`.
driver.quit()
我的预期输出是这样
[
{
"type": "Course Material",
"title": "Impressionism as optical realism: Monet",
"link": "https://smarthistory.org/impressionism-optical-realism-monet/",
"source": "Smarthistory",
"base_url": "https://smarthistory.org",
"license": "Attribution-NonCommercial-ShareAlike"
},
{
"type": "Course Material",
"title": "Impressionism: painting modern life",
"link": "https://smarthistory.org/painting-modern-life/",
"source": "Smarthistory",
"base_url": "https://smarthistory.org",
"license": "Attribution-NonCommercial-ShareAlike"
}
]
答案 0(得分:0)
使用Google Chrome的开发工具(F12)时,您可以检查网络流量。
只需在网站上转到DEV工具内的“网络”标签,然后单击“加载更多”按钮即可。
您应该会在列表中看到一个请求(object?tag=DDD&page=2)。
在循环内使用请求URL遍历页面。
通过这种方式,您无需单击按钮即可直接获取JSON。
答案 1(得分:0)
# Answer 1: same scraping approach, but stop when the "load more" button
# can no longer be clicked (the click raises inside the try/except loop).
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from time import sleep
from random import randint
import json

url = "https://smarthistory.org/americas-before-1900/"
driver = webdriver.Chrome('chromedriver')
driver.get(url)

page_num = 0
# Click "load more" until the button disappears or stops being clickable.
# (The original also had a redundant pre-loop click and discarded
# find_elements calls; the loop alone covers both.)
while True:
    try:
        driver.find_element_by_css_selector('#load-more-cc-objects').click()
        page_num += 1
        print("getting page number " + str(page_num))
        # sleep(randint(2, 10))
        sleep(1)  # was time.sleep(1): only `sleep` is imported, `time` is not
    except Exception:  # was bare except: don't swallow KeyboardInterrupt etc.
        print("Reached bottom of page")
        break

data = []
# Parse HTML of the fully loaded page.
page_soup = soup(driver.page_source, 'lxml')
containers = page_soup.findAll(
    "div", {"class": "mb-8 hover-zoom"})
for container in containers:
    item = {}
    item['type'] = "Course Material"
    item['title'] = container.find(
        'h5', {'class': 'm-0 mt-4 text-grey-darker text-normal leading-tight hover-connect'}).text.strip()
    item['link'] = container.a["href"]
    item['source'] = "Smarthistory"
    item['base_url'] = "https://smarthistory.org"
    item['license'] = "Attribution-NonCommercial-ShareAlike"
    data.append(item)  # add the item to the list

with open("smarthistory-2.json", "w") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)

driver.quit()