这个网页爬虫的目标是收集产品信息(尺码、ID 等)。除了 psize
和 psizeID
总是重复第一个产品的数据之外,其余部分都已正常工作。有人能给我指出正确的方向吗?我遗漏了什么?提前感谢。
更新代码:
import requests
import time
from bs4 import BeautifulSoup
from discord_webhook import DiscordWebhook, DiscordEmbed
# Browser-like User-Agent passed along with the HTTP requests below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}

# Search-results page that lists the products to scrape.
url = "https://www.jdsports.com.sg/search/dunk/"

# Download the listing once at import time and parse it with the lxml parser.
productsource = requests.get(url, headers=headers).text
productinfo = BeautifulSoup(productsource, "lxml")
def jdMonitor():
    """Scrape the listing page once and announce unseen products on Discord.

    For every ``.itemContainer`` element on the listing page this reads the
    product title, price, SKU, image link and product link, then downloads the
    product's ``stock`` page to collect its sizes and size SKUs. When the
    product SKU is not yet recorded in ``info.txt`` the SKU is appended to
    that file and an embed is posted to the Discord webhook; otherwise
    ``'Nothing New yet!'`` is printed. The function sleeps 30 seconds before
    returning so it can be called in a polling loop.
    """
    # Still published as module globals so the values of the last scraped
    # product remain readable after the call.
    global plink, pname, pID, price, imagelink, pstocklink, psizeID, psize

    # Download the listing on every call: a page parsed only once at import
    # time would never show products listed after the script started.
    listing = BeautifulSoup(requests.get(url, headers=headers).text, "lxml")

    # Load the already-announced SKUs once per pass. A missing file simply
    # means nothing has been announced yet (first run).
    fileName = 'info.txt'
    try:
        with open(fileName, 'r') as rf:
            seen = {line.strip() for line in rf if line.strip()}
    except FileNotFoundError:
        seen = set()

    for item in listing.select(".itemContainer"):
        pname = item.find("span", class_='itemTitle').text  # product title
        price = item.find("span", class_="pri").text  # product price
        pID = item.find("span", class_="itemOverlay")["data-productsku"]  # product sku
        imagelink = item.find('img')['data-src']  # product image link
        pLinkback = item.a["href"]  # relative link to the product page
        plink = f"https://www.jdsports.com.sg{pLinkback}"
        pstocklink = (plink + 'stock')

        # Scrape the size / size-SKU buttons for THIS product. This must run
        # inside the per-product loop, otherwise every product reports the
        # sizes of a single stock page.
        stocksource = requests.get(pstocklink, headers=headers).text
        stockpage = BeautifulSoup(stocksource, "lxml")
        buttons = stockpage.select("#productSizeStock > button")
        psize = '\n'.join(button.get_text(strip=True) for button in buttons)
        psizeID = '\n'.join(button["data-sku"] for button in buttons)

        # Exact match against recorded SKUs; a substring test on the whole
        # file would treat "123" as seen whenever "1234" is recorded.
        if pID in seen:
            print('Nothing New yet!')
            continue

        seen.add(pID)
        with open(fileName, 'a') as af:
            af.write('\n' + pID)

        # NOTE(review): this webhook URL embeds a secret token; it should be
        # loaded from configuration instead of being committed to source.
        webhook = DiscordWebhook(url='https://discord.com/api/webhooks/857232347807088660/_bPTPUb783qWztTXCtW8Mx9JUVO6wjBKgXzOquwLSaCMEGhKbWe6C2Gp3CqUV_oI2s29', username="Gizmo Monitors")
        embed = DiscordEmbed(title=pname, url=plink, color='ff9999')
        embed.set_footer(text='Powered by Gizmo Cookgroup', icon_url='https://media.discordapp.net/attachments/827028442078904321/857360717811417098/new_logo_CG.png?width=670&height=670')
        embed.set_timestamp()
        embed.set_thumbnail(url=imagelink)
        embed.add_embed_field(name='Price', value=price, inline=False)
        embed.add_embed_field(name='ID', value=pID, inline=False)
        # Discord rejects embed fields whose value is empty, so use a
        # placeholder when the stock page lists no sizes.
        embed.add_embed_field(name='Size', value=psize or 'N/A')
        embed.add_embed_field(name='SizeID', value=psizeID or 'N/A')
        webhook.add_embed(embed)
        webhook.execute()

    # Pause once per pass before the caller polls again.
    time.sleep(30)
# Poll indefinitely; each jdMonitor() call performs one scraping pass.
while True:
    jdMonitor()
答案 0 :(得分:1)
将下面的代码缩进到 for item in productinfo.select(".itemContainer"):
循环内
# Place these statements inside the per-product loop so they run once for each item.
sizes = []
sizeskus = []
for button in stockpage.select("#productSizeStock > button"):
    ...
重构后的代码,并为响应添加了异常处理:
import json
import os
import time

import requests
from bs4 import BeautifulSoup
from discord_webhook import DiscordEmbed, DiscordWebhook
# Collection page whose products are monitored.
url = "https://m.jdsports.com.sg/collection/jordan-air-1/"

# Browser-like User-Agent used for the scraping session.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/91.0.4472.106 Safari/537.36',
}

# JSON file in which the last known products are persisted between runs.
jd_file = 'jd.json'

# Seconds to wait between two polling passes.
sleep = 60
def get_stock(session: requests.Session, stock_link: str) -> dict:
    """Fetch a product's stock page and map each size SKU to its size label.

    Args:
        session: session used to perform the GET request.
        stock_link: URL of the product's stock page.

    Returns:
        dict mapping the ``data-sku`` attribute of every
        ``#productSizeStock > button`` element to that button's stripped text.

    Raises:
        requests.HTTPError: if the stock page answers with an error status.
    """
    page = session.get(stock_link)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    page.raise_for_status()

    size_buttons = BeautifulSoup(page.text, "lxml").select("#productSizeStock > button")
    sizes_by_sku = {}
    for size_button in size_buttons:
        size_sku = size_button["data-sku"]
        sizes_by_sku[size_sku] = size_button.get_text(strip=True)
    return sizes_by_sku
def get_products() -> dict:
    """Scrape the collection page and return every listed product keyed by SKU.

    Returns:
        dict mapping each product SKU to a dict with the keys ``"title"``,
        ``"price"``, ``"image"``, ``"link"`` and ``"stock"``; ``"stock"`` is
        the mapping returned by ``get_stock`` for that product.

    Raises:
        requests.HTTPError: if the listing page or any stock page answers
            with an error status.
    """
    products = dict()
    with requests.Session() as session:
        # Merge into the session's default headers instead of replacing them,
        # so the defaults requests normally sends are kept alongside the
        # custom User-Agent.
        session.headers.update(headers)
        response = session.get(url)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        product_soup = BeautifulSoup(response.text, "lxml")
        for item in product_soup.select(".itemContainer"):
            sku = item.find("span", class_="itemOverlay")["data-productsku"]
            product_link = f"https://www.jdsports.com.sg{item.a['href']}"
            # The stock page lives directly under the product page URL.
            stock = get_stock(session, product_link + 'stock')
            products[sku] = {
                "title": item.find("span", class_='itemTitle').get_text(strip=True),
                "price": item.find("span", class_="pri").get_text(strip=True),
                "image": item.find('img')['data-src'],
                "link": product_link,
                "stock": stock,
            }
    return products
def send_discord_message(sku: str, product: dict):
    """Post a Discord embed describing one product.

    Args:
        sku: product SKU, shown in the embed's "ID" field.
        product: mapping with the keys ``'title'``, ``'link'``, ``'image'``,
            ``'price'`` and ``'stock'``; ``'stock'`` maps size SKU to size
            label and fills the "SizeID" and "Size" fields.
    """
    stock = product['stock']
    # Discord rejects embed fields whose value is empty, so fall back to a
    # placeholder when the product currently lists no sizes.
    sizes = '\n'.join(stock.values()) or 'N/A'
    size_ids = '\n'.join(stock.keys()) or 'N/A'

    webhook = DiscordWebhook(url='webhook url', username="example")
    embed = DiscordEmbed(title=product['title'], url=product['link'], color='ff9999')
    embed.set_footer(text='example', icon_url='example')
    embed.set_timestamp()
    embed.set_thumbnail(url=product['image'])
    embed.add_embed_field(name='Price', value=product['price'], inline=False)
    embed.add_embed_field(name='ID', value=sku, inline=False)
    embed.add_embed_field(name='Size', value=sizes)
    embed.add_embed_field(name='SizeID', value=size_ids)
    webhook.add_embed(embed)
    webhook.execute()
if __name__ == '__main__':
    # Products known from previous runs, keyed by SKU; empty on the first run.
    stored_products = dict()
    if os.path.exists(jd_file):
        with open(jd_file, 'r') as r:
            stored_products = json.load(r)

    while True:
        try:
            products = get_products()
        except requests.RequestException as error:
            # A transient network or HTTP failure should not stop a
            # long-running monitor; report it and retry on the next pass.
            print(f'Failed to fetch products: {error}')
            time.sleep(sleep)
            continue

        for sku, product in products.items():
            # Send discord message for new or changed product
            if sku not in stored_products or product != stored_products[sku]:
                send_discord_message(sku, product)

        # Update stored products and save to file. The merged mapping is
        # persisted (not just the current scrape) so the file matches the
        # in-memory state that the comparison above relies on.
        stored_products.update(products)
        with open(jd_file, 'w') as w:
            json.dump(stored_products, w, indent=2)
        time.sleep(sleep)