I need to create a news dataset. I want to extract all the news articles ever published on a given news website. I have written this code:
import requests
from bs4 import BeautifulSoup
import pandas
import csv
from datetime import datetime
records=[]
def cnbc(base_url):
    # scrape a single CNBC article and append its fields to records
    r = requests.get(base_url)
    c = r.content
    soup = BeautifulSoup(c, "html.parser")
    Title = soup.find("h1", {"class": "title"}).text.replace("\r", "").replace("\n", "")
    content = ' '
    for content_tag in soup.find_all("p"):
        content = content + content_tag.text.replace("\r", "").replace("\n", "")
    content = content[18:-458]
    Country = 'United States'
    website = 'https://www.cnbc.com/'
    comments = ''
    genre = 'Political'
    date = soup.find("time", {"class": "datestamp"}).text[35:-2].replace("\r", "").replace("\n", "")
    d = datetime.strptime(date, "%d %b %Y")
    date = d.strftime("%d-%m-%Y")
    records.append((Title, content, Country, website, comments, genre, date))
cnbc("https://www.cnbc.com/2018/11/02/here-are-the-three-things-pulling-down-the-stock-market-again.html")
But this only lets me extract a single news article.
Can anyone tell me how to extract the URLs of all the news articles starting from the root of the website?
Answer 0 (score: 0)
This is a Python 3 script. It is not perfect, but I hope it can serve as a starting point so you can achieve what you are after. I am not sure whether the site you want to scrape allows this kind of thing, so I will not put its URLs into the constants WEB_SITE_BASE_URL and WEB_SITE_REGION_URL; you can choose what to put there.
import requests
from bs4 import BeautifulSoup
from datetime import datetime
# https://www.xxxx.com"
WEB_SITE_BASE_URL= ""
# https://www.xxxx.com/?region=us
WEB_SITE_REGION_URL = ""
def get_categories(web_site_base_url):
    # the category names live in the navigation menu spans
    r = requests.get(web_site_base_url)
    c = r.content
    soup = BeautifulSoup(c, "html.parser")
    spans = soup.find_all(attrs={"class": "nav-menu-buttonText"})
    categories = [category.text for category in spans]
    return categories

def get_links(category_url):
    # collect every link on a category page, keeping only November 2018 articles
    r = requests.get(category_url)
    c = r.content
    soup = BeautifulSoup(c, "html.parser")
    links = [a.get('href') for a in soup.find_all('a', href=True)]
    filtered_links = list(set([k for k in links if '/2018/11/' in k]))
    return filtered_links
def news(link):
    # scrape one article page and return its fields as a tuple
    r = requests.get(link)
    c = r.content
    soup = BeautifulSoup(c, "html.parser")
    Title = soup.find("h1", {"class": "title"}).text.replace("\r", "").replace("\n", "")
    content = ' '
    for content_tag in soup.find_all("p"):
        content = content + content_tag.text.replace("\r", "").replace("\n", "")
    content = content[18:-458]
    Country = 'United States'
    website = WEB_SITE_BASE_URL
    comments = ''
    date = soup.find("time", {"class": "datestamp"}).text[35:-2].replace("\r", "").replace("\n", "")
    d = datetime.strptime(date, "%d %b %Y")
    date = d.strftime("%d-%m-%Y")
    # use the breadcrumb title as the genre/category of the article
    spans = soup.find_all(attrs={"class": "header_title last breadcrumb"})
    categories = [category.text for category in spans]
    genre = categories
    return (Title, content, Country, website, comments, genre, date)
categories = get_categories(WEB_SITE_REGION_URL)
list_of_link_lists = []
for category in categories:
    # URL-encode spaces in the category name before building the category page URL
    list_of_link_lists.append(get_links(WEB_SITE_BASE_URL + "/" + category.replace(" ", "%20")))
# flatten the per-category lists and drop duplicates
flat_link_list = list(set([item for sublist in list_of_link_lists for item in sublist]))
articles_list = []
for link in flat_link_list:
    try:
        articles_list.append(news(WEB_SITE_BASE_URL + link))
    except Exception as e:
        print("Something went wrong:", e)
        continue
print(articles_list)
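If the goal is a dataset rather than a printout, the collected tuples can be written out at the end. A minimal sketch, assuming the tuple layout returned by news() above and that pandas (already imported in the question) is available; the column names and output file name are just illustrative:
import pandas as pd
# each tuple is (Title, content, Country, website, comments, genre, date)
columns = ["title", "content", "country", "website", "comments", "genre", "date"]
df = pd.DataFrame(articles_list, columns=columns)
df.to_csv("news_dataset.csv", index=False, encoding="utf-8")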
Answer 1 (score: 0)
Here is a rough approach that extracts part of all the news, as my code shows. First, grab every div whose class is headline:
news_headline = soup.find_all("div", class_="headline")
Then check whether each element is really what we want.
new = []
for div in news_headline:
    # skip headline divs that do not contain a link
    if div.a:
        link = url + div.a.get("href")
        if div.a.text:
            # use split/join to remove \t, \n and extra blank space
            title = " ".join(div.a.text.split())
        else:
            title = " ".join(div.a.get("title").split())
        new.append((link, title))
    else:
        continue
Here is the complete code; I have kept it as short as possible.
import requests
from bs4 import BeautifulSoup
def index(url="https://www.cnbc.com/world/"):
    with requests.Session() as se:
        se.encoding = "UTF-8"
        res = se.get(url)
        text = res.text
        soup = BeautifulSoup(text, "lxml")
        news_headline = soup.find_all("div", class_="headline")
        news_ = [(url + div.a.get("href"),
                  " ".join(div.a.text.split()) if div.a.text else "".join(div.a.get("title").split()))
                 for div in news_headline if div.a]
        print(news_)

index()
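To turn the headline list into the dataset the question asks for, one option is to have index() return news_ instead of printing it and then feed every collected URL into a per-article scraper such as the cnbc() function from the question. A minimal sketch, assuming index() has been changed to return news_ and that each collected URL points directly at an article page:
article_urls = [link for link, title in index()]
for article_url in article_urls:
    try:
        cnbc(article_url)  # appends one (Title, content, ...) tuple to records
    except Exception as e:
        # some pages may not match the expected h1/time markup
        print("Skipping", article_url, "-", e)
print(len(records), "articles collected")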