当页码包含在URL中时,如何抓取(scrape)多个页面?
例如:
https://www.cars.com/for-sale/searchresults.action/?mdId=21811&mkId=20024&page=**1**&perPage=100&rd=99999&searchSource=PAGINATION&showMore=false&sort=relevance&stkTypId=28880&zc=11209
我的代码:
import requests
from bs4 import BeautifulSoup
from csv import writer

# Download page 1 of the search results and parse the HTML.
response = requests.get('https://www.cars.com/for-sale/searchresults.action/?mdId=21811&mkId=20024&page=1&perPage=100&rd=99999&searchSource=PAGINATION&showMore=false&sort=relevance&stkTypId=28880&zc=11209')
soup = BeautifulSoup(response.text, 'html.parser')
# One element per listing card on the results page.
posts = soup.find_all(class_='shop-srp-listings__inner')

# newline='' is required by the csv module to avoid blank rows on Windows.
with open('posts.csv', 'w', newline='') as csv_file:
    csv_writer = writer(csv_file)
    headers = ['title', 'color', 'price']
    csv_writer.writerow(headers)
    for post in posts:
        # Listing title, with embedded newlines stripped out.
        title = post.find(class_="listing-row__title").get_text().replace('\n', '').strip()
        # color = post.find("li").get_text().replace('\n', '')
        price = post.find("span", attrs={"class": "listing-row__price"}).get_text().replace('\n', '').strip()
        print(title, price)
        # csv_writer.writerow([title, color, price])
感谢您的帮助
答案 0(得分:1):
# Iterate page numbers directly instead of keeping a manual counter
# (the original `page = 0; page += 1` inside `for x in range(25)` is
# an un-Pythonic re-implementation of range(1, 26)).
for page in range(1, 26):
    # Splice the current page number into the query string.
    url = ('https://www.cars.com/for-sale/searchresults.action/?mdId=21811&mkId=20024&page='+str(page)+
           '&perPage=100&rd=99999&searchSource=PAGINATION&showMore=false&sort=relevance&stkTypId=28880&zc=11209')
    print(url)
    # requests.get(url)
答案 1(得分:0):
获取页面总数并重复访问每个页面。
import requests
from bs4 import BeautifulSoup
from csv import writer

# Template for every results page; only the page number varies.
# BUG FIX: the original URL contained a stray space ('for- sale') and a
# missing '=' ('showMorefalse'), so every request would 404 / misbehave.
BASE_URL = ('https://www.cars.com/for-sale/searchresults.action/?mdId=21811&mkId=20024&page={}'
            '&perPage=100&rd=99999&searchSource=PAGINATION&showMore=false&sort=relevance&stkTypId=28880&zc=11209')

# newline='' is required by the csv module to avoid blank rows on Windows.
with open('posts.csv', 'w', newline='') as csv_file:
    csv_writer = writer(csv_file)
    headers = ['title', 'color', 'price']
    csv_writer.writerow(headers)

    # Fetch page 1 once just to read the total page count from the pager.
    response = requests.get(BASE_URL.format(1))
    soup = BeautifulSoup(response.text, 'html.parser')

    # BUG FIX: find_all() returns a ResultSet, which cannot be used in
    # range(1, number_of_pages + 1) — that raised TypeError. Use find()
    # to get the single pager element and convert its text to an int.
    last_page = soup.find(class_='js-last-page')
    number_of_pages = int(last_page.get_text(strip=True)) if last_page else 1

    # Visit every page and extract each listing.
    for page in range(1, number_of_pages + 1):
        response = requests.get(BASE_URL.format(page))
        soup = BeautifulSoup(response.text, 'html.parser')
        posts = soup.find_all(class_='shop-srp-listings__inner')
        for post in posts:
            title = post.find(class_="listing-row__title").get_text().replace('\n', '').strip()
            # color = post.find("li").get_text().replace('\n', '')
            price = post.find("span", attrs={"class": "listing-row__price"}).get_text().replace('\n', '').strip()
            print(title, price)
            # csv_writer.writerow([title, color, price])