仅抓取第一页的网页

时间:2020-04-23 03:26:04

标签: python

我正在尝试让我的代码从网站(例如 https://www.space.com/news)上抓取前10条新闻文章的名称,但它只抓取了第一页的数据。我需要网络爬虫能够转到下一页,并同样获取那里的文章名称。不确定我的代码中是否缺少了抓取第二页及后续页面、直到循环结束的相关逻辑?

from time import time
from time import sleep
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from bs4 import BeautifulSoup
import requests


# define a function to process the page
def process_page(soup, articles):
  """Extract article records from one parsed page and append them to *articles*.

  Every element matching the CSS selector ``.content`` is treated as a
  candidate article. For each candidate a dictionary with the keys
  ``headline``, ``author``, ``synopsis`` and ``date_time`` is appended to
  *articles*. A field whose element is missing is stored as ``None``
  instead of raising ``AttributeError``. Candidates without a headline are
  skipped entirely.

  Args:
    soup: parsed document exposing ``select(css)``; the selected elements
      must expose ``select_one(css)`` (e.g. a BeautifulSoup object).
    articles: list that is extended in place with the extracted records.

  Returns:
    None. The accumulated *articles* list is printed as a side effect.
  """

  def text_of(parent, selector, strip=False):
    # select_one yields None when nothing matches; propagate that as None
    # rather than failing on .get_text().
    found = parent.select_one(selector)
    if found is None:
      return None
    text = found.get_text()
    return text.strip() if strip else text

  # find all elements with class *content*
  for article in soup.select('.content'):
    headline = text_of(article, 'header > h3') # extract headline title
    if headline is None:
      # a .content element without a headline is not an article entry
      continue

    articles.append({
      'headline': headline,
      'author': text_of(article, '.byline > .by-author > span', strip=True), # extract author
      'synopsis': text_of(article, '.synopsis'), # extract synopsis
      # store the text of the <time> element, not the element object itself
      'date_time': text_of(article, 'header > p > time', strip=True),
    })

  print(articles)

# prepare for the monitoring logic
start_time = time() # note the system time when the program starts
request_count = 0 # track the number of requests made

# create variables to store the data
articles = [] # article dictionaries appended by process_page

# variables to handle the request loop
has_next_page = True
MAX_REQUESTS = 10 # do not request more than 10 pages
page_number = 1
url = 'https://www.space.com/news'
headers = {'user-agent': 'articlescraper'}


while has_next_page and request_count < MAX_REQUESTS:
  # keep the output clear
  clear_output(wait = True)

  # Rebuild the query on every iteration. A dict created once before the
  # loop keeps the value page_number had at that moment, so every request
  # would ask for page 1 again.
  # NOTE(review): assumes the site honours a `page` query parameter -- confirm.
  query = {'tab':'newest', 'page': page_number}

  # A timeout keeps one stalled connection from hanging the whole run.
  response = requests.get(url, params=query, headers=headers, timeout=10)
  request_count += 1 # count first so the logs below are 1-based

  # make sure we got a valid response
  if response.ok:
    # get the full data from the response
    soup = BeautifulSoup(response.text, 'html.parser')
    process_page(soup, articles)

    # check for the next page
    # 'a[span="Next"]' would match an <a> carrying an *attribute* named span,
    # which never occurs, so look at the link text instead.
    # NOTE(review): presumably the pagination link contains the text "Next";
    # verify against the live markup.
    has_next_page = any('Next' in link.get_text() for link in soup.select('a'))

  else:
    # display a warning if there are any problems
    warn('Request #: {}, Failed with status code: {}'.format(request_count, response.status_code))

  # go to sleep for a bit
  # wait a random 1 to 3 seconds before making the next request
  sleep(randint(1,3))

  # output some logs for monitoring
  elapsed_time = time() - start_time
  print('Requests: {}, Frequency: {} requests/s, {} articles processed.'.format(request_count, request_count/elapsed_time, len(articles)))

  # prepare for next iteration
  page_number += 1


print('Sraping complete')
print('Requests: {}, Frequency: {} requests/s, {} articles processed.'.format(request_count, request_count/elapsed_time, len(articles)))

0 个答案:

没有答案