I only started learning Python a few months ago, so I'm still very new to it. I've been experimenting with web scraping and wanted to try it against a search engine. I can successfully extract data from a single page, but when it comes to following the "next" button, I can't seem to get out of the while loop. I'm also having trouble appending additional list items to request. Any help or guidance would be greatly appreciated.
import requests
from bs4 import BeautifulSoup
import time

def Scraper(search_term, number_results, language_code):
    while True:
        time.sleep(5)
        base_url = 'https://www.example.com'
        index_url = 'https://www.example.com/search?q={}&num={}&hl={}'.format(search_term, number_results, language_code)
        next_urls = [index_url]
        assert isinstance(search_term, str), 'Search term must be a string'
        assert isinstance(number_results, int), 'Number of results must be an integer'
        escaped_search_term = search_term.replace(' ', '+')

        def scrapePage():
            for item in next_urls:
                response = requests.get(item)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                contents = soup.find_all('li', attrs={'class': 'b_algo'})
                for r in contents:
                    anchor_t = r.find('h2')
                    l_title = anchor_t.find('a')
                    title = l_title.get_text()
                    link = l_title['href']
                    description = r.find('p')
                    descriptions = description.get_text()
                    nextUrl = soup.find('a', class_='sb_pagN', href=True)
                    nextBtn = nextUrl['href']
                    page = f"{base_url}{nextBtn}"
                    next_urls.append(page)
                    print(f"\n{title}\n{descriptions}\n{link}")
            return next_urls

        for item in next_urls:
            res = requests.get(item)
            soup = BeautifulSoup(res.text, 'html.parser')
            contents = soup.find_all('li', attrs={'class': 'b_algo'})
            nextUrl = soup.find('a', class_='sb_pagN', href=True)
            nextBtn = nextUrl['href']
            page = f"{base_url}{item}"
            next_urls.append(page)

        scrapePage()
        print(f'\n{next_urls}')

Scraper('f150 rapter', 100, 'en')
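
For reference, here is a minimal sketch of the pagination pattern I think I'm after, assuming the same li.b_algo result markup and sb_pagN next-button class that my code above targets. The max_pages cap is my own addition so the loop is guaranteed to stop even if a next button is always present:

import requests
from bs4 import BeautifulSoup
import time

def scrape(search_term, number_results, language_code, max_pages=5):
    base_url = 'https://www.example.com'
    # Build the first search URL; later iterations overwrite `url` with the next page.
    url = 'https://www.example.com/search?q={}&num={}&hl={}'.format(
        search_term.replace(' ', '+'), number_results, language_code)
    for _ in range(max_pages):  # hard cap so the loop always terminates
        time.sleep(5)  # be polite between requests
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for r in soup.find_all('li', attrs={'class': 'b_algo'}):
            anchor = r.find('h2').find('a')
            description = r.find('p')
            print(f"\n{anchor.get_text()}\n{description.get_text() if description else ''}\n{anchor['href']}")
        next_link = soup.find('a', class_='sb_pagN', href=True)
        if next_link is None:  # no next button means we reached the last page
            break
        url = base_url + next_link['href']  # follow the next button on the next pass

scrape('f150 rapter', 100, 'en')

The key differences from my version: the next-page URL replaces the current URL instead of being appended to a list that is being iterated over, and the loop exits either when the next button is missing or when the page cap is hit.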