网络爬虫重复值

时间:2021-06-23 21:22:28

标签: python web web-scraping beautifulsoup python-requests

webscraper 的目标是收集产品信息大小 id 等。除了 sizepsizeID 只是第一个产品数据的重复之外,我已经做对了一切。任何人都可以指出我在正确的方向?我错过了什么......提前谢谢你。

更新代码:

import requests
import time
from bs4 import BeautifulSoup
from discord_webhook import DiscordWebhook, DiscordEmbed

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'
}
url = "https://www.jdsports.com.sg/search/dunk/"
productsource = requests.get(url,headers=headers).text
productinfo = BeautifulSoup(productsource, "lxml")

def jdMonitor():
    #webscraper
    for item in productinfo.select(".itemContainer"):

        global plink
        global pname
        global pID
        global price
        global imagelink
        global pstocklink
        global psizeID
        global psize
        

        pname = item.find("span",class_='itemTitle').text                  #product title
        price = item.find("span", class_="pri").text                       #product price
        pID = item.find("span", class_="itemOverlay")["data-productsku"]   #product sku
        imagelink = item.find('img')['data-src']                           #product image link
        pLinkback = item.a["href"]                                         #to get product page link
        plink = f"https://www.jdsports.com.sg{pLinkback}"
        pstocklink = (plink + 'stock')


        #scraping the for the product size ID
        stocksource = requests.get(pstocklink,headers=headers).text
        stockpage = BeautifulSoup(stocksource, "lxml")

        sizes = []
        sizeskus = []
        for button in stockpage.select("#productSizeStock > button"):
            sizes.append(button.get_text(strip=True))
            sizeskus.append(button["data-sku"])
            psize = '\n'.join(sizes)
            psizeID = '\n'.join(sizeskus)

        fileName = 'info.txt'
        with open(fileName,'r') as rf:
            with open(fileName,'a') as af:
                read = rf.read()
                if pID not in read:
                    af.write('\n'+pID)
                    webhook = DiscordWebhook(url='https://discord.com/api/webhooks/857232347807088660/_bPTPUb783qWztTXCtW8Mx9JUVO6wjBKgXzOquwLSaCMEGhKbWe6C2Gp3CqUV_oI2s29',username="Gizmo Monitors")
                    embed = DiscordEmbed(title= pname, url=plink , color='ff9999')
                    embed.set_footer(text='Powered by Gizmo Cookgroup', icon_url='https://media.discordapp.net/attachments/827028442078904321/857360717811417098/new_logo_CG.png?width=670&height=670')
                    embed.set_timestamp()
                    embed.set_thumbnail(url=imagelink)
                    embed.add_embed_field(name='Price', value=price,inline=False)
                    embed.add_embed_field(name='ID', value=pID,inline=False)
                    embed.add_embed_field(name='Size', value=psize)
                    embed.add_embed_field(name='SizeID', value=psizeID)
                    webhook.add_embed(embed)
                    response = webhook.execute()
                    
                else: 
                    print('Nothing New yet!')                    
    time.sleep(30)
   
while True:
    jdMonitor()
    

1 个答案:

答案 0 :(得分:1)

将下面的代码缩进到 for item in productinfo.select(".itemContainer"): 循环内

sizes = []
    sizeskus = []
    for button in stockpage.select("#productSizeStock > button"):
    ...

重构代码,为响应和元素添加异常句柄:

import requests

import time
from bs4 import BeautifulSoup

url = "https://m.jdsports.com.sg/collection/jordan-air-1/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/91.0.4472.106 Safari/537.36'
}
jd_file = 'jd.json'
sleep = 60


def get_stock(session: requests.Session, stock_link: str) -> dict:
    response = session.get(stock_link)
    # add exception handle
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    stock = {button["data-sku"]: button.get_text(strip=True) for button in soup.select("#productSizeStock > button")}
    return stock


def get_products() -> dict:
    products = dict()

    with requests.Session() as session:
        session.headers = headers

        response = session.get(url)
        # add exception handle
        response.raise_for_status()

        product_soup = BeautifulSoup(response.text, "lxml")

        for item in product_soup.select(".itemContainer"):
            sku = item.find("span", class_="itemOverlay")["data-productsku"]
            product_link = f"https://www.jdsports.com.sg{item.a['href']}"
            stock_link = product_link + 'stock'

            stock = get_stock(session, stock_link)

            products[sku] = {
                "title": item.find("span", class_='itemTitle').get_text(strip=True),
                "price": item.find("span", class_="pri").get_text(strip=True),
                "image": item.find('img')['data-src'],
                "link": f"https://www.jdsports.com.sg{item.a['href']}",
                "stock": stock
            }
        return products


def send_discord_message(sku: str, product: dict):
    webhook = DiscordWebhook(url='webhook url', username="example")

    embed = DiscordEmbed(title=product['title'], url=product['link'], color='ff9999')
    embed.set_footer(text='example', icon_url='example')
    embed.set_timestamp()
    embed.set_thumbnail(url=product['image'])
    embed.add_embed_field(name='Price', value=product['price'], inline=False)
    embed.add_embed_field(name='ID', value=sku, inline=False)
    embed.add_embed_field(name='Size', value='\n'.join(product['stock'].values()))
    embed.add_embed_field(name='SizeID', value='\n'.join(product['stock'].keys()))

    webhook.add_embed(embed)
    response = webhook.execute()


if __name__ == '__main__':
    stored_products = dict()
    if os.path.exists(jd_file):
        with open(jd_file, 'r') as r:
            stored_products = json.load(r)

    while True:
        products = get_products()

        for sku, product in products.items():
            # Send discord message for new or changed product
            if sku not in stored_products or product != stored_products[sku]:
                send_discord_message(sku, product)

        # Update stored products and save to file
        stored_products.update(products)

        with open(jd_file, 'w') as w:
            json.dump(products, w, indent=2)

        time.sleep(sleep)