I am currently using Selenium with Python to scrape product reviews from this Lazada page: https://www.lazada.sg/products/loreal-paris-uv-perfect-even-complexion-sunscreen-spf50pa-30ml-i214861100-s325723972.html?spm=a2o42.seller.list.1.75895319pt8HKU&mp=1
However, I can only extract the first page of product reviews. Does anyone know how to extract the reviews from page 2 onwards?
Here is the code (it fails with an error that the element cannot be clicked):
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import time
from selenium.webdriver.chrome.options import Options

url = 'https://www.lazada.sg/products/loreal-paris-uv-perfect-even-complexion-sunscreen-spf50pa-30ml-i214861100-s325723972.html?spm=a2o42.seller.list.1.75895319pt8HKU&mp=1'
chrome_options = Options()
#chrome_options.add_argument("--headless")
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=chrome_options)
driver.get(url)
time.sleep(0.1)

review_csv = []
product_csv = []
rating_csv = []
date_review_csv = []

titles = driver.find_element_by_class_name('pdp-mod-product-badge-title').text
print(titles)

product_reviews = driver.find_elements_by_css_selector("[class='item']")
urls = []

#Page 1 of product review
for product in product_reviews:
    review = product.find_element_by_css_selector("[class='content']").text
    if(review != "" or review.strip()):
        print(review)
        review_csv.append(review)
    else:
        print(review)
        review_csv.append("No comments/review is an image")
    #Product Purchase
    #Check if the product purchase exists
    product_purchase = product.find_element_by_css_selector("[class='skuInfo']").text
    print(product_purchase)
    product_csv.append(product_purchase)
    #Star rating
    star_ratings = product.find_elements_by_css_selector("[class='star']")
    stars = "https://laz-img-cdn.alicdn.com/tfs/TB19ZvEgfDH8KJjy1XcXXcpdXXa-64-64.png"
    star_rate = 0
    for rating in star_ratings:
        #print(rating.get_attribute('src'))
        if(rating.get_attribute('src') == stars):
            star_rate = star_rate + 1
    rating_csv.append(star_rate)
    print(star_rate)
    # Date of Review
    date = product.find_element_by_css_selector("[class='title right']").text
    date_review_csv.append(date)
    print(date)

#Page 2 of product review onwards
page2_product_reviews = driver.find_element_by_xpath('//*[@id="module_product_review"]/div/div[3]/div[2]/div/div/button[2]').click()
for product in page2_product_reviews:
    review = product.find_element_by_css_selector("[class='content']").text
    if(review != "" or review.strip()):
        print(review)
        review_csv.append(review)
    else:
        print(review)
        review_csv.append("No comments/review is an image")
    #Product Purchase
    #Check if the product purchase exists
    product_purchase = product.find_element_by_css_selector("[class='skuInfo']").text
    print(product_purchase)
    product_csv.append(product_purchase)
    #Star rating
    star_ratings = product.find_elements_by_css_selector("[class='star']")
    stars = "https://laz-img-cdn.alicdn.com/tfs/TB19ZvEgfDH8KJjy1XcXXcpdXXa-64-64.png"
    star_rate = 0
    for rating in star_ratings:
        #print(rating.get_attribute('src'))
        if(rating.get_attribute('src') == stars):
            star_rate = star_rate + 1
    rating_csv.append(star_rate)
    print(star_rate)
    # Date of Review
    date = product.find_element_by_css_selector("[class='title right']").text
    date_review_csv.append(date)
    print(date)
driver.close()
Thanks in advance!
Answer 0 (score: 0)
To handle the pagination, use an infinite loop: check whether the next-pagination-item button has the disabled attribute, and if it does, break out of the loop; otherwise click the next button. Clicking the button through execute_script also avoids the "cannot click element" error you are seeing, because a JavaScript click does not require the button to be scrolled into view and unobstructed.
Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome(executable_path='chromedriver')
driver.get("https://www.lazada.sg/products/loreal-paris-uv-perfect-even-complexion-sunscreen-spf50pa-30ml-i214861100-s325723972.html?spm=a2o42.seller.list.1.758953196tH2Mn&mp=1")

review_csv = []
product_csv = []
rating_csv = []
date_review_csv = []

titles = driver.find_element_by_class_name('pdp-mod-product-badge-title').text
print(titles)

while True:
    # Get the review details here
    WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.item")))
    product_reviews = driver.find_elements_by_css_selector("[class='item']")
    # Get product review
    for product in product_reviews:
        review = product.find_element_by_css_selector("[class='content']").text
        if(review != "" or review.strip()):
            print(review)
            review_csv.append(review)
        else:
            print(review)
            review_csv.append("No comments/review is an image")
        # Product Purchase
        # Check if the product purchase exists
        product_purchase = product.find_element_by_css_selector("[class='skuInfo']").text
        print(product_purchase)
        product_csv.append(product_purchase)
        # Star rating
        star_ratings = product.find_elements_by_css_selector("[class='star']")
        stars = "https://laz-img-cdn.alicdn.com/tfs/TB19ZvEgfDH8KJjy1XcXXcpdXXa-64-64.png"
        star_rate = 0
        for rating in star_ratings:
            # print(rating.get_attribute('src'))
            if(rating.get_attribute('src') == stars):
                star_rate = star_rate + 1
        rating_csv.append(star_rate)
        print(star_rate)
        # Date of Review
        date = product.find_element_by_css_selector("[class='title right']").text
        date_review_csv.append(date)
        print(date)
    # If the next-pagination-item button has the disabled attribute, this is
    # the last page, so break out of the loop; otherwise click the next button
    if len(driver.find_elements_by_css_selector("button.next-pagination-item.next[disabled]")) > 0:
        break
    else:
        button_next = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button.next-pagination-item.next")))
        driver.execute_script("arguments[0].click();", button_next)
        print("next page")
        time.sleep(2)

driver.close()
print(review_csv)
print(product_csv)
print(rating_csv)
print(date_review_csv)
The lists print as follows:
['Fast delivery, send within 3 days, in bubble envelope. Product expiry date : 0522', 'received in good condition. have not try it yet', 'No comments/review is an image', 'No comments/review is an image', 'No comments/review is an image', 'No comments/review is an image', 'No comments/review is an image', 'No comments/review is an image']
['Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30']
[5, 5, 5, 4, 4, 5, 5, 5]
['24 Oct 2019', '17 Nov 2019', '21 Nov 2019', '25 Oct 2019', '29 Aug 2019', '24 Apr 2019', '19 Jan 2019', '11 Nov 2018']
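A note on the API, since the answer above was written against Selenium 3: the find_element_by_* / find_elements_by_* helpers and the executable_path argument were removed in Selenium 4, so on a current install the same pagination loop has to use the By locators. Below is a minimal sketch of the equivalent loop, assuming Lazada's markup and the selectors above are unchanged; it collects only the review text to keep the example short.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

driver = webdriver.Chrome()  # Selenium 4.6+ downloads and manages chromedriver itself
driver.get("https://www.lazada.sg/products/loreal-paris-uv-perfect-even-complexion-sunscreen-spf50pa-30ml-i214861100-s325723972.html?spm=a2o42.seller.list.1.758953196tH2Mn&mp=1")

reviews = []
while True:
    # Wait for the review items on the current page to render
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.item")))
    for item in driver.find_elements(By.CSS_SELECTOR, "div.item"):
        reviews.append(item.find_element(By.CSS_SELECTOR, ".content").text)
    # Same stop condition as above: a disabled "next" button means the last page
    if driver.find_elements(By.CSS_SELECTOR, "button.next-pagination-item.next[disabled]"):
        break
    button_next = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "button.next-pagination-item.next")))
    driver.execute_script("arguments[0].click();", button_next)
    time.sleep(2)  # give the next page of reviews time to load

driver.quit()
print(reviews)

The disabled-attribute check and the JavaScript click are the same technique as in the answer; only the locator API differs.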