我正在从网站 https://www.nike.com/w/mens-shoes-nik1zy7ok 抓取鞋子数据。目前,我可以用下面的代码获取页面最初加载的鞋子,以及向下滚动到下一页时加载的鞋子:
import re
import json
import requests
from bs4 import BeautifulSoup
# Category page; its HTML embeds the first batch of products as JSON.
url = 'https://www.nike.com/gb/w/womens-shoes-5e1x6zy7ok'
# A timeout keeps a stalled connection from blocking the script forever.
html_data = requests.get(url, timeout=30).text

# The page assigns a JSON object literal to window.INITIAL_REDUX_STATE;
# capture that literal and decode it. The dot is escaped so it only matches
# a literal ".".
data = json.loads(re.search(r'window\.INITIAL_REDUX_STATE=(\{.*?\});', html_data).group(1))

# Products that were rendered with the initial page load.
for p in data['Wall']['products']:
    print(p['title'])
    print(p['subtitle'])
    print(p['price']['currentPrice'], p['price']['currency'])
    # Swap the width token in the URL from w_400 to w_1920.
    print(p['colorways'][0]['images']['portraitURL'].replace('w_400', 'w_1920'))
    print('-' * 120)

# Path of the next page, appended to the site origin below.
next_page = data['Wall']['pageData']['next']

# Follow the "next" links until a response no longer provides one.
while next_page:
    u = 'https://www.nike.com' + next_page
    data = requests.get(u, timeout=30).json()
    for o in data['objects']:
        # Only the first productInfo entry of each object is printed.
        p = o['productInfo'][0]
        print(p['productContent']['title'])
        print(p['productContent']['subtitle'])
        print(p['merchPrice']['currentPrice'], p['merchPrice']['currency'])
        print(p['imageUrls']['productImageUrl'])
        print('-' * 120)
    # Tolerate a missing/empty "pages" entry or one without a "next" key,
    # instead of raising KeyError on the last page.
    next_page = (data.get('pages') or {}).get('next')
如何把所有这些鞋子合并到一个字典中,以便用下面的模板打印结果:
{# Render every entry of shoes['Wall']['products'] in a <p>, each followed by a "New shoe" heading. -#}
{% for shoe in shoes['Wall']['products'] %}
<p>{{shoe}}</p>
<h2>New shoe</h2>
{% endfor %}
答案 0 :(得分:0)
这是一个递归生成器函数,应急时可以解决问题,不过写得有些粗糙。要用于生产环境还需要补充不少内容(例如处理请求错误等),但它应该能让您朝着正确的方向前进。如果有任何不明白的地方,请随时提问——这里涉及一些不太容易理解的概念。
import re
import json
import requests
def _join_url(url, path):
    """Concatenate *url* and *path*, collapsing a doubled "/" at the seam."""
    if not path:
        # Covers the ``path=None`` default, which would otherwise be
        # formatted into the URL as the literal text "None".
        return url
    if url.endswith("/") and path.startswith("/"):
        return url + path[1:]
    return url + path


def _extract_initial_state(html):
    """Return the JSON value assigned to ``window.INITIAL_REDUX_STATE`` in *html*.

    Raises:
        ValueError: if the marker is absent or what follows it is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    marker = re.search(r"window\.INITIAL_REDUX_STATE\s*=\s*", html)
    if marker is None:
        raise ValueError("window.INITIAL_REDUX_STATE not found in response body")
    # raw_decode parses exactly one JSON value and ignores whatever follows,
    # so a "};" inside a string value cannot cut the document short the way
    # a non-greedy regex capture can.
    state, _ = json.JSONDecoder().raw_decode(html[marker.end():])
    return state


def _products_from_api(data):
    """Yield one normalized dict per ``productInfo`` entry of a JSON page."""
    for obj in data["objects"]:
        for info in obj["productInfo"]:
            yield {
                "title": info["productContent"]["title"],
                "subtitle": info["productContent"]["subtitle"],
                "price": info["merchPrice"]["currentPrice"],
                "currency": info["merchPrice"]["currency"],
                "image_url": info["imageUrls"]["productImageUrl"],
            }


def _products_from_state(state):
    """Yield one normalized dict per product of the embedded page state."""
    for product in state["Wall"]["products"]:
        portrait = product["colorways"][0]["images"]["portraitURL"]
        yield {
            "title": product["title"],
            "subtitle": product["subtitle"],
            "price": product["price"]["currentPrice"],
            "currency": product["price"]["currency"],
            # Swap the width token in the URL from w_400 to w_1920.
            "image_url": portrait.replace("w_400", "w_1920"),
        }


def get_shoes(url="https://www.nike.com/", path=None, *, session=None, timeout=30):
    """Recursively yield product dicts from a listing page and its next pages.

    The page at ``url + path`` is fetched. If the body decodes as JSON it is
    read as a paginated product response (``objects`` / ``pages``); otherwise
    it is treated as HTML and the products are taken from the embedded
    ``window.INITIAL_REDUX_STATE`` object. While a page names a next page,
    the function recurses into it, so the generator nesting grows by one
    level per page.

    Args:
        url: Base URL that every page path is appended to.
        path: Path of the page to fetch; ``None`` or ``""`` fetches ``url``.
        session: Optional object with a requests-style ``get(url, timeout=...)``
            method (for example a ``requests.Session``). Defaults to the
            ``requests`` module itself.
        timeout: Seconds passed to every ``get`` call as ``timeout``.

    Yields:
        dict: ``title``, ``subtitle``, ``price``, ``currency`` and
        ``image_url`` of one product.

    Raises:
        ValueError: if a response is neither JSON nor HTML containing a
            decodable ``window.INITIAL_REDUX_STATE`` assignment.
        KeyError: if a page lacks one of the expected keys.
    """
    http = requests if session is None else session
    response = http.get(_join_url(url, path), timeout=timeout)
    try:
        data = response.json()
    except ValueError:
        # Both the stdlib and simplejson decode errors that ``response.json()``
        # may raise derive from ValueError; this is the HTML landing page.
        state = _extract_initial_state(response.text)
        products = _products_from_state(state)
        next_page = state["Wall"]["pageData"]["next"]
    else:
        products = _products_from_api(data)
        # Tolerate a missing/empty "pages" entry or one without "next".
        next_page = (data.get("pages") or {}).get("next")
    yield from products
    if next_page:
        yield from get_shoes(url, next_page, session=session, timeout=timeout)
# Collect the scraped shoes in the nested shape the template iterates over
# (shoes['Wall']['products']) instead of leaving the dict empty.
shoes = {"Wall": {"products": []}}
for shoe in get_shoes(path="gb/w/womens-shoes-5e1x6zy7ok"):
    shoes["Wall"]["products"].append(shoe)
    # Echo each product as it arrives, one field per line.
    print(shoe["title"])
    print(shoe["subtitle"])
    print(shoe["price"], shoe["currency"])
    print(shoe["image_url"])
    print("-" * 120)