使用python请求从网站上抓取信息

时间:2020-07-20 03:13:47

标签: python web-scraping beautifulsoup python-requests jinja2

我正在从网站 https://www.nike.com/w/mens-shoes-nik1zy7ok 抓取鞋子信息。目前,使用下面的代码,我可以检索到页面最初加载的鞋子,以及滚动到下一页时加载的鞋子:

import re
import json
import requests
from bs4 import BeautifulSoup


# Scrape the first (HTML) page via the embedded Redux state, then follow the
# JSON pagination API for every subsequent page.
url = 'https://www.nike.com/gb/w/womens-shoes-5e1x6zy7ok'
html_data = requests.get(url).text

# The product data is embedded as a JS assignment. Escape the literal dot so
# the regex matches exactly "window.INITIAL_REDUX_STATE=...", and fail with a
# clear error (instead of AttributeError) if the page layout has changed.
match = re.search(r'window\.INITIAL_REDUX_STATE=(\{.*?\});', html_data)
if match is None:
    raise RuntimeError('INITIAL_REDUX_STATE not found in page HTML')
data = json.loads(match.group(1))

for p in data['Wall']['products']:
    print(p['title'])
    print(p['subtitle'])
    print(p['price']['currentPrice'], p['price']['currency'])
    # Request the 1920px-wide rendition instead of the 400px thumbnail.
    print(p['colorways'][0]['images']['portraitURL'].replace('w_400', 'w_1920'))
    print('-' * 120)

next_page = data['Wall']['pageData']['next']
while next_page:
    u = 'https://www.nike.com' + next_page

    data = requests.get(u).json()
    for o in data['objects']:
        p = o['productInfo'][0]
        print(p['productContent']['title'])
        print(p['productContent']['subtitle'])
        print(p['merchPrice']['currentPrice'], p['merchPrice']['currency'])
        print(p['imageUrls']['productImageUrl'])
        print('-' * 120)

    # A missing or empty 'next' value ends the pagination loop.
    next_page = data.get('pages', {}).get('next', '')

如何把抓取到的所有鞋子收集到同一个数据结构中,以便在模板里用如下代码打印结果:

{% for shoe in shoes['Wall']['products'] %}
    <p>{{shoe}}</p>
    <h2>New shoe</h2>
  {% endfor %}

1 个答案:

答案 0 :(得分:0)

这是一个递归生成器函数,可以快速完成这项工作,但代码还有些粗糙。要作为生产代码,确实还需要补充更多内容,例如处理请求错误等,但它应该能让你朝正确的方向前进。这里用到了一些不太容易理解的概念,如果有任何令人困惑的地方,请务必提问。

import re
import json
import requests


def get_shoes(url="https://www.nike.com/", path=None):
    """Yield one dict per shoe (title, subtitle, price, currency, image_url)
    for every page of a Nike product wall, following pagination to the end.

    The first page is an HTML document with the data embedded in a
    ``window.INITIAL_REDUX_STATE`` assignment; subsequent pages are plain
    JSON from the pagination API.  We detect which kind we received by
    attempting to parse the response as JSON first (EAFP).

    Parameters:
        url:  Base URL; page paths are appended to it verbatim.
        path: Initial page path relative to *url* (None is treated as "").

    Raises:
        RuntimeError: if an HTML response does not contain the expected
            ``INITIAL_REDUX_STATE`` marker.
    """
    # Iterate instead of recursing: a wall with many pages would otherwise
    # grow one nested generator frame per page.  ``path or ""`` also fixes
    # the default case, which previously requested ".../None".
    next_path = path or ""
    while True:
        response = requests.get(f"{url}{next_path}")

        try:
            data = response.json()
        except json.JSONDecodeError:
            # HTML page: extract the embedded Redux state.  The dot is
            # escaped so the regex matches the exact property name, and a
            # missing marker raises a clear error instead of AttributeError.
            match = re.search(
                r"window\.INITIAL_REDUX_STATE=(\{.*?\});", response.text
            )
            if match is None:
                raise RuntimeError("INITIAL_REDUX_STATE not found in page HTML")
            state = json.loads(match.group(1))
            for p in state["Wall"]["products"]:
                yield {
                    "title": p["title"],
                    "subtitle": p["subtitle"],
                    "price": p["price"]["currentPrice"],
                    "currency": p["price"]["currency"],
                    # Swap the 400px thumbnail for the 1920px rendition.
                    "image_url": p["colorways"][0]["images"][
                        "portraitURL"
                    ].replace("w_400", "w_1920"),
                }
            next_path = state["Wall"]["pageData"]["next"]
        else:
            # JSON pagination response.
            for obj in data["objects"]:
                for p in obj["productInfo"]:
                    yield {
                        "title": p["productContent"]["title"],
                        "subtitle": p["productContent"]["subtitle"],
                        "price": p["merchPrice"]["currentPrice"],
                        "currency": p["merchPrice"]["currency"],
                        "image_url": p["imageUrls"]["productImageUrl"],
                    }
            next_path = data.get("pages", {}).get("next", "")

        # An empty/absent next path means we have consumed the last page.
        if not next_path:
            return


# Accumulate every shoe while printing it, so the complete collection can
# afterwards be handed to a template for rendering.  (The original declared
# an empty dict that was never populated.)
shoes = []

for shoe in get_shoes(path="gb/w/womens-shoes-5e1x6zy7ok"):
    shoes.append(shoe)
    print(shoe["title"])
    print(shoe["subtitle"])
    print(shoe["price"], shoe["currency"])
    print(shoe["image_url"])
    print("-" * 120)