我正试图从网站 https://www.rithmschool.com/blog 抓取网页。第一页的内容可以正常抓取,但我的代码存在问题:对于网站的其他所有页面,抓取到的仍然是与第一页相同的内容。下面是代码。有人可以帮我修复代码吗?我的代码:
import requests
from bs4 import BeautifulSoup
from csv import writer
def write_article_rows(page_soup, csv_writer):
    """Write one CSV row (title, link, date) for every <article> in a page.

    Args:
        page_soup: Parsed BeautifulSoup document for a single blog page.
        csv_writer: ``csv.writer`` object the rows are written to.

    Raises:
        AttributeError / TypeError / KeyError: if an article lacks the
            expected ``<a>`` or ``<time datetime=...>`` elements (same
            failure mode as the original inline loops).
    """
    for article in page_soup.find_all('article'):
        anchor = article.find('a')
        title = anchor.get_text()
        link = anchor['href']
        date = article.find('time')['datetime']
        csv_writer.writerow([title, link, date])


html = requests.get('https://www.rithmschool.com/blog')
soup = BeautifulSoup(html.text, 'html.parser')

# newline='' is what the csv module requires for files handed to csv.writer
# (otherwise extra blank lines can appear on some platforms); an explicit
# encoding keeps non-ASCII titles from depending on the platform default.
with open('scraped_rithm.csv', 'w', newline='', encoding='utf-8') as f:
    data = writer(f)
    data.writerow(['Title', 'Link', 'Date'])

    # Articles on the first (landing) page.
    write_article_rows(soup, data)

    # Follow each pagination link found on the first page.
    for span in soup.find_all('span', {'class': 'page'}):
        page_link = span.find('a')
        if page_link is None:
            # A 'page' span without a link has nothing to follow.
            continue

        # NOTE(review): assumes the href is a site-relative path, as the
        # original code did by prefixing the domain -- confirm against the site.
        nw_urls = f"https://www.rithmschool.com{page_link['href']}"
        print(nw_urls)
        nw_response = requests.get(nw_urls)
        nw_soup = BeautifulSoup(nw_response.text, 'html.parser')

        # Bug fix: read the articles from the newly fetched page (nw_soup).
        # The original queried `soup` here, so every page repeated page 1.
        write_article_rows(nw_soup, data)
答案 0 :(得分:0)
您为每个新页面创建了新的 soup 对象 nw_soup,但在查找文章时仍然使用了旧的 soup 对象,所以每一页得到的都是第一页的内容。
请尝试将此内容从
nw_soup = BeautifulSoup(nw_response.text,'html.parser')
articles = soup.find_all('article')
到
nw_soup = BeautifulSoup(nw_response.text,'html.parser')
articles = nw_soup.find_all('article')