Question

我目前正在创建不同的项目来掌握网络抓取的概念。目前，我正在尝试从鞋履销售网站创建商品数据库，但似乎无法以文本形式获取数据。

我试过了。

from selenium import webdriver 
import time
import requests
from bs4 import BeautifulSoup
import numpy
import statistics
import pandas as pd

offset=0
driver=webdriver.Chrome()
listo=[]

while True:

    driver.get("https://stockx.com/sneakers?page={offset}".format(offset=offset))
    time.sleep(10)
    main_div=driver.find_elements_by_xpath('//*[@id="main-content"]/div[2]/div[2]/div/div')

    for div in main_div:
        links=div.find_elements_by_tag_name("a")

        for link in links:
            namer=(link.get_attribute('href'))
            print(namer)


            offset+=0.05

            listo.append(namer)
            namelist = sorted(set(listo))

            for hreflink in namelist:
                hreflinks=(hreflink)

                driver.get(hreflinks)
                time.sleep(10)

                
           
           
                LastsaleD=driver.find_elements_by_xpath('//[@id="marketsummary"]/div[2]/div/div[1]
/div[1]')
                print(LastsaleD).text






    if offset>30:
        break

Answer 1

在这里使用 Selenium 是一种矫枉过正且效率较低的做法。数据以 json 格式存在于源 html 的 <script> 标签中。对站点做一个简单的请求，用json拉出相关的<script>，然后把json解析成行放入表中。

另外，为什么要增加 offset+=0.05？我理解您为页面上的 20 个项目添加它的逻辑，但为什么不在循环遍历 20 个项目后只增加 1？如果您因任何原因退回 19 件商品或 21 件商品，会发生什么情况？然后你的增量将在循环的其余部分关闭。

无论如何，这是代码。这会让你继续前进。

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import re

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}
at_end = False

offset = 0
rows = []
while at_end == False:
    offset+=1
    url = "https://stockx.com/sneakers?page={offset}".format(offset=offset)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    scripts = soup.find_all('script', {'type':'application/ld+json'})
    for script in scripts:
        jsonMatch = re.compile("{.*}")
        jsonStr = jsonMatch.search(str(script))[0]
        jsonData = json.loads(jsonStr)
        if jsonData['@type'] == 'OfferCatalog':
            break

    listings = jsonData['itemListElement']
    for listing in listings:
        item = listing['item']
        offers = item.pop('offers')
        item.update(offers)
        
        if item not in rows:
            rows.append(item)
        else:
            at_end = True
            continue
    print('Page: %s' %offset)

df = pd.DataFrame(rows)

输出：

print(df)
              @type        brand  ... highPrice priceCurrency
0    AggregateOffer       Jordan  ...       165           GBP
1    AggregateOffer       Jordan  ...       226           GBP
2    AggregateOffer       Jordan  ...       321           GBP
3    AggregateOffer       Jordan  ...       159           GBP
4    AggregateOffer       Jordan  ...       190           GBP
..              ...          ...  ...       ...           ...
495  AggregateOffer         Nike  ...       230           GBP
496  AggregateOffer  New Balance  ...       159           GBP
497  AggregateOffer         Nike  ...       152           GBP
498  AggregateOffer         Nike  ...       162           GBP
499  AggregateOffer         Nike  ...       167           GBP

[500 rows x 14 columns]

我似乎无法通过 xpath

1 个答案: