我正在尝试使用Selenium和BeautifulSoup对具有“加载更多”按钮的网站进行网络抓取。我已经让脚本成功单击“加载更多”按钮并加载了其余内容,但是在将内容抓取到json文件时遇到了问题。这是我的脚本:
# Scrape Smarthistory course items: click the "load more" button in a
# Selenium-driven Chrome until it disappears, parse the final DOM with
# BeautifulSoup, and dump the collected items to a JSON file.
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import json  # was missing: json.dump below raised NameError
import time

url = "https://smarthistory.org/americas-before-1900/"
driver = webdriver.Chrome('/Users/rawlins/Downloads/chromedriver')
driver.get(url)

page_num = 0
# Keep clicking "load more" while the button is still present in the DOM
# (find_elements returns an empty, falsy list once it is gone).
while driver.find_elements_by_css_selector('#load-more-cc-objects'):
    driver.find_element_by_css_selector('#load-more-cc-objects').click()
    page_num += 1
    print("getting page number " + str(page_num))
    time.sleep(1)  # give the newly requested content a moment to load

data = []
# Parse HTML of the fully loaded page.
page_soup = soup(driver.page_source, 'lxml')
containers = page_soup.findAll(
    "div", {"class": "mb-8 hover-zoom tablescraper-selected-row opacity-100"})
for container in containers:
    item = {}
    item['type'] = "Course Material"
    item['title'] = container.find(
        'h5',
        {'class': 'm-0 mt-4 text-grey-darker text-normal leading-tight hover-connect'}
    ).text.strip()
    item['link'] = container.a["href"]
    item['source'] = "Smarthistory"
    item['base_url'] = "https://smarthistory.org"
    item['license'] = "Attribution-NonCommercial-ShareAlike"
    data.append(item)  # add the item to the list

with open("smarthistory-2.json", "w") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)

# Was browser.quit(): `browser` was never defined; the driver is `driver`.
driver.quit()
我的预期输出是这样
[
{
"type": "Course Material",
"title": "Impressionism as optical realism: Monet",
"link": "https://smarthistory.org/impressionism-optical-realism-monet/",
"source": "Smarthistory",
"base_url": "https://smarthistory.org",
"license": "Attribution-NonCommercial-ShareAlike"
},
{
"type": "Course Material",
"title": "Impressionism: painting modern life",
"link": "https://smarthistory.org/painting-modern-life/",
"source": "Smarthistory",
"base_url": "https://smarthistory.org",
"license": "Attribution-NonCommercial-ShareAlike"
}
]
答案 0(得分:0)
使用Google Chrome的开发工具(F12)时,您可以检查网络流量。
只需在网站上转到DEV工具内的“网络”标签,然后单击“加载更多”按钮即可。
您应该会在列表中看到一个请求(object?tag=DDD&page=2)。
在循环内使用请求URL遍历页面。
通过这种方式,您无需单击按钮即可直接获取JSON。
答案 1(得分:0)
# Answer 1: same scraping approach, but stop when the "load more" button
# can no longer be clicked (the click raises inside the try/except loop).
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from time import sleep
from random import randint
import json

url = "https://smarthistory.org/americas-before-1900/"
driver = webdriver.Chrome('chromedriver')
driver.get(url)

page_num = 0
# Click "load more" until the button disappears or stops being clickable.
# (The original also had a redundant pre-loop click and discarded
# find_elements calls; the loop alone covers both.)
while True:
    try:
        driver.find_element_by_css_selector('#load-more-cc-objects').click()
        page_num += 1
        print("getting page number " + str(page_num))
        # sleep(randint(2, 10))
        sleep(1)  # was time.sleep(1): only `sleep` is imported, `time` is not
    except Exception:  # was bare except: don't swallow KeyboardInterrupt etc.
        print("Reached bottom of page")
        break

data = []
# Parse HTML of the fully loaded page.
page_soup = soup(driver.page_source, 'lxml')
containers = page_soup.findAll(
    "div", {"class": "mb-8 hover-zoom"})
for container in containers:
    item = {}
    item['type'] = "Course Material"
    item['title'] = container.find(
        'h5', {'class': 'm-0 mt-4 text-grey-darker text-normal leading-tight hover-connect'}).text.strip()
    item['link'] = container.a["href"]
    item['source'] = "Smarthistory"
    item['base_url'] = "https://smarthistory.org"
    item['license'] = "Attribution-NonCommercial-ShareAlike"
    data.append(item)  # add the item to the list

with open("smarthistory-2.json", "w") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)

driver.quit()