我目前正在做一个网络抓取项目,需要从 Delhaize 网站抓取产品及其价格和可能的折扣。使用我的代码,我可以得到正确数量的产品,但是有些产品没有价格或折扣,因此,我试图逐个产品地查找对应的价格和折扣。但是,我得到的价格和折扣数量总是不对,不是太多就是太少。
你能帮我吗?我的代码如下:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from datetime import datetime
import time
# Proxy endpoints keyed by URL scheme, and a minimal request header dict.
myProxy = dict(
    http="http://10.120.118.49:8080",
    https="https://10.120.118.49:8080",
)
headers = {
    'User-agent': 'Mozilla/5.0',
}

# Result lists that the scraping code appends to.
Product, Price, Discount = [], [], []

# Chrome flags are applied in the listed order; '--headless' is passed twice.
chrome_options = Options()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    "--proxy-server=http://10.120.118.49:8080",
    "--headless",
):
    chrome_options.add_argument(chrome_flag)

# Start Chrome through the local chromedriver binary with the flags above.
driver = webdriver.Chrome(
    options=chrome_options,
    executable_path='C:/Users/C71220/chromedriver.exe',
)
# Visit result pages 0-5 of the filtered Delhaize drinks listing and append
# product names, prices and promotions to the Product / Price / Discount lists.
# NOTE(review): the original indentation of this snippet is not visible, so the
# exact nesting of the statements below cannot be confirmed from here.
for u in range(0,6):
url='https://www.delhaize.be/nl-be/shop/Dranken-en-alcohol/c/v2DRI?q=:relevance:manufacturerNameFacet:Coca-Cola:manufacturerNameFacet:Schweppes:manufacturerNameFacet:Fanta:manufacturerNameFacet:Chaudfontaine&sort=relevance&pageNumber=' + str(u)
driver.get(url)
try:
# makes the scraper wait until the element is loaded on the website
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'data-item')))
# Product names: one entry per description block found anywhere on the page.
for products in driver.find_elements_by_xpath("//div[@class='description anchor--no-style']"):
Product.append(products.text.strip('\n'))
# Basket areas are collected separately from the names, so nothing ties a
# price or promotion to the product it belongs to.
product=driver.find_elements_by_xpath("//div[@class='layout-basket-area']")
for i in product:
# NOTE(review): an XPath that starts with `//` is evaluated from the document
# root even when called on the element `i`; `.//` would restrict the search to
# `i`. As written, every basket area appears to re-collect all prices on the
# page, which would explain the mismatched list lengths - confirm.
prices=i.find_elements_by_xpath("//span[@class='quantity-price super-bold']")
for a in prices:
# NOTE(review): `a` comes from the list returned by find_elements, so this
# check presumably never takes the else branch; a missing price yields no
# entry at all rather than ''.
if a is not None:
Price.append(a.text)
else:
Price.append('')
# NOTE(review): find_element (singular) raises NoSuchElementException instead
# of returning None; that exception is caught by the handler below, which
# skips whatever remains of this page. Same `//` root-search caveat applies.
promotions=i.find_element_by_xpath("//div[@class='PromotionStickerWrapper']")
# Both branches append the same object: the element itself, not its text or
# title attribute.
if promotions is not None:
Discount.append(promotions)
else:
Discount.append(promotions)
print('Scraping...')
# Missing elements and wait timeouts are silently ignored.
except (NoSuchElementException, TimeoutException):
pass
# Dump the collected lists and their lengths so the counts can be compared.
print(Product, Price, Discount)
print(len(Product))
print(len(Price))
print(len(Discount))
编辑:
价格的HTML代码如下:
<div class="layout-basket-area"...<div>
<span class="quantity-price super-bold">
折扣为:
<div class="layout-basket-area"...<div>
<div class="layout-shot">...<div>
<div class="PromotionStickerWrapper" title="- 25% voor 2">
答案 0 :(得分:2)
代码中的错误太多,无法逐一修复。我重写了部分代码并添加了注释。试试这个:
import re


def _first_number(text):
    """Return the first decimal number found in *text* as a float.

    Either ',' or '.' is accepted as the decimal separator, so "€1,99" gives
    1.99 and a sticker such as "- 25% voor 2" gives 25.0 instead of raising
    ValueError. Returns 0.0 when *text* contains no digits (including the
    empty string used for a missing element).
    """
    match = re.search(r"\d+(?:[.,]\d+)?", text)
    if match is None:
        return 0.0
    return float(match.group().replace(",", "."))


def _child_text(parent, class_name):
    """Return the text of the first element with *class_name* inside *parent*.

    Returns "" when *parent* has no such descendant, so callers do not need
    their own NoSuchElementException handling.
    """
    try:
        return parent.find_element(By.CLASS_NAME, class_name).text
    except NoSuchElementException:
        return ""


# Scrape result pages 0-5. Every product tile contributes exactly one entry to
# each of Product, Price and Discount, so the three lists stay aligned.
for u in range(0, 6):
    url = 'https://www.delhaize.be/nl-be/shop/Dranken-en-alcohol/c/v2DRI?q=:relevance:manufacturerNameFacet:Coca-Cola:manufacturerNameFacet:Schweppes:manufacturerNameFacet:Fanta:manufacturerNameFacet:Chaudfontaine&sort=relevance&pageNumber=' + str(u)
    driver.get(url)
    try:
        # Wait until at least one product tile has been rendered.
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'data-item')))
    except TimeoutException:
        # No tiles showed up within 20 s; skip this page instead of aborting
        # the whole run.
        continue

    # Search inside each tile so a price/discount can only come from the
    # product it belongs to.
    for product in driver.find_elements(By.CLASS_NAME, "data-item"):
        product_name = product.find_element(By.CLASS_NAME, "ProductHeader").text.replace("\n", " - ")
        # A tile without a price or promotion element yields 0.0.
        float_product_price = _first_number(_child_text(product, "quantity-price"))
        float_product_discount = _first_number(_child_text(product, "multiLinePromotion"))

        # Append only after all three values are known to keep the lists the
        # same length.
        Product.append(product_name)
        Price.append(float_product_price)
        Discount.append(float_product_discount)

print(Product, Price, Discount)
print(len(Product))
print(len(Price))
print(len(Discount))