import bs4
import requests
import re

r = requests.get('https://www.the961.com/latest-news/lebanon-news/').text
soup = bs4.BeautifulSoup(r, 'lxml')

for article in soup.find_all('article'):
    title = article.h3.text
    print(title)

    date = article.find('span', class_='byline-part date')
    if date: print('Date:', date.text)

    author = article.find('span', class_="byline-part author")
    if author: print('Author:', author.text)

    link = article.find('h3', class_='title').a['href']
    link_r = requests.get(link).text
    soup_link = bs4.BeautifulSoup(link_r, 'lxml')

    # Grabbing the link from the title, then opening that link and trying to
    # scrape the whole article. Very new to this, so I don't know how to do it!
    for article in soup_link.find_all('article'):
        paragraph = article.find('p')
        print(paragraph)
    print()
Answer (score: 2)
On some pages the <p> tag is not under <article>, so it returns None. Instead, to scrape all the paragraphs (and <li> tags, if present), use the following CSS selector: .entry-content > p, .entry-content li

To use a CSS selector, use the .select() method instead of .find_all().
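As a side illustration of why the child combinator matters (the HTML below is made up for the demo, not taken from the site): .entry-content > p matches only <p> tags that are direct children of the container, while .entry-content li matches <li> tags at any depth.

import bs4

# Hypothetical markup, just to show the selector semantics
html = """
<div class="entry-content">
  <p>direct paragraph</p>
  <blockquote><p>nested paragraph</p></blockquote>
  <ul><li>list item</li></ul>
</div>
"""

soup = bs4.BeautifulSoup(html, "lxml")
# Prints "direct paragraph" and "list item"; the paragraph nested in
# <blockquote> is skipped because it is not a direct child
for tag in soup.select(".entry-content > p, .entry-content li"):
    print(tag.get_text(strip=True))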
In your code example:
import bs4
import requests

r = requests.get("https://www.the961.com/latest-news/lebanon-news/").text
soup = bs4.BeautifulSoup(r, "lxml")

for article in soup.find_all("article"):
    title = article.h3.text
    print(title)

    date = article.find("span", class_="byline-part date")
    if date:
        print("Date:", date.text)

    author = article.find("span", class_="byline-part author")
    if author:
        print("Author:", author.text, "\n")

    link = article.find("h3", class_="title").a["href"]
    link_r = requests.get(link).text
    soup_link = bs4.BeautifulSoup(link_r, "lxml")

    # Select all `p` tags (and `li`) under the class `entry-content`
    for page in soup_link.select(".entry-content > p, .entry-content li"):
        print(page.get_text(strip=True))
    print("-" * 80)
    print()
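Two small hardening tweaks you may also want (my additions, not part of the original answer): article.h3 can itself be None on listing layouts that lack the expected title markup, which would crash the unguarded title and link lookups, and requests.get can hang indefinitely without a timeout. A sketch of the loop with those guards, the 10-second timeout being an arbitrary choice:

import bs4
import requests

r = requests.get("https://www.the961.com/latest-news/lebanon-news/", timeout=10).text
soup = bs4.BeautifulSoup(r, "lxml")

for article in soup.find_all("article"):
    h3 = article.find("h3", class_="title")
    # Skip entries whose markup lacks the expected title/link structure
    if not (h3 and h3.a and h3.a.get("href")):
        continue
    print(h3.text)

    link_r = requests.get(h3.a["href"], timeout=10).text
    soup_link = bs4.BeautifulSoup(link_r, "lxml")
    for page in soup_link.select(".entry-content > p, .entry-content li"):
        print(page.get_text(strip=True))
    print("-" * 80)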