I'm trying to get my code to scrape the names of the first 10 news articles from https://www.space.com/news, for example, but it only scrapes that data from the first page. I need the web scraper to be able to move on to the next page and pick up those article names as well. I'm not sure whether I'm missing something in my code about how to scrape the second page and so on, until the loop finishes?
from time import time
from time import sleep
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from bs4 import BeautifulSoup
import requests
# define a function to process the page
def process_page(soup, articles):
    # find all article containers with class *content*
    article_names = soup.select('.content')
    # extract the info we need from each article
    for article in article_names:
        headline = article.select_one('header > h3').get_text()  # extract headline title
        author = article.select_one('.byline > .by-author > span').get_text().strip()  # extract author
        synopsis = article.select_one('.synopsis').get_text()  # extract synopsis
        date_time = article.select_one('header > p > time')  # extract time
        new_article = {'headline': headline, 'author': author, 'synopsis': synopsis, 'date_time': date_time}  # construct a dictionary
        articles.append(new_article)
    print(articles)
# prepare for the monitoring logic
start_time = time() # note the system time when the program starts
request_count = 0 # track the number of requests made
# create a variable to store the data
articles = []
# variables to handle the request loop
has_next_page = True
MAX_REQUESTS = 10 # do not request more than 10 pages
page_number = 1
query = {'tab':'newest', 'page': page_number}
url = 'https://www.space.com/news'
headers = {'user-agent': 'articlescraper'}
while has_next_page and request_count < MAX_REQUESTS:
    # keep the output clear
    clear_output(wait=True)
    # make a request for the current page
    response = requests.get(url, params=query, headers=headers)
    # make sure we got a valid response
    if response.ok:
        # get the full data from the response
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')
        process_page(soup, articles)
        # check for the next page:
        # look for the presence of a link to the next page
        next_button = soup.select('a[span="Next"]')
        has_next_page = len(next_button) > 0
    else:
        # display a warning if there are any problems
        warn('Request #: {}, Failed with status code: {}'.format(request_count, response.status_code))
    request_count += 1
    # go to sleep for a bit
    # we use a random number between 1 and 3 so
    # we can wait as long as 3 seconds before making the next request
    sleep(randint(1, 3))
    # output some logs for monitoring
    elapsed_time = time() - start_time
    print('Requests: {}, Frequency: {} requests/s, {} articles processed.'.format(request_count, request_count / elapsed_time, len(articles)))
    # prepare for the next iteration
    page_number += 1
print('Scraping complete')
print('Requests: {}, Frequency: {} requests/s, {} articles processed.'.format(request_count, request_count / elapsed_time, len(articles)))
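For what it's worth, this is roughly how I understand the page number would need to change between requests. A minimal, self-contained sketch, assuming space.com actually honours a ?page=N query parameter (I have not confirmed that it does):

import requests

# hypothetical sketch (assumes a ?page=N parameter, which I have not verified):
# rebuild the query dict on every pass so the page number sent to the server changes
url = 'https://www.space.com/news'
headers = {'user-agent': 'articlescraper'}
for page_number in range(1, 4):
    query = {'tab': 'newest', 'page': page_number}  # query rebuilt for each page
    response = requests.get(url, params=query, headers=headers)
    print(page_number, response.status_code, response.url)

Is something along those lines what my loop is missing, or is the problem elsewhere (for example the next-button selector)?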