如何在Python中从Lazada抓取所有产品评论

时间:2020-01-14 13:28:45

标签: python selenium web-scraping

我目前正在使用 Python 的 Selenium 库,从 Lazada 网站抓取数据: https://www.lazada.sg/products/loreal-paris-uv-perfect-even-complexion-sunscreen-spf50pa-30ml-i214861100-s325723972.html?spm=a2o42.seller.list.1.75895319pt8HKU&mp=1

但是,我只能提取产品评论的第一页。有人知道如何提取第 2 页及之后的评论吗?

以下是我的代码(运行时会报错,提示无法点击该元素):

from selenium import webdriver
from bs4 import BeautifulSoup as soup
import time
from selenium.webdriver.chrome.options import Options


# Scrape the product title and every page of customer reviews from one Lazada
# product page, collecting the results into four parallel lists.
url = 'https://www.lazada.sg/products/loreal-paris-uv-perfect-even-complexion-sunscreen-spf50pa-30ml-i214861100-s325723972.html?spm=a2o42.seller.list.1.75895319pt8HKU&mp=1'
chrome_options = Options()
# Uncomment to pass Chrome the --headless flag.
#chrome_options.add_argument("--headless")

driver = webdriver.Chrome(executable_path='chromedriver',chrome_options=chrome_options)

# Parallel result lists: index i of each list describes the same review.
review_csv = []
product_csv = []
rating_csv = []
date_review_csv = []

urls = []

# A star element counts toward the rating only when its src equals this URL.
stars = "https://laz-img-cdn.alicdn.com/tfs/TB19ZvEgfDH8KJjy1XcXXcpdXXa-64-64.png"

# Selectors for the pagination "next" button and for its disabled state.
NEXT_BUTTON_SELECTOR = "button.next-pagination-item.next"
NEXT_BUTTON_DISABLED_SELECTOR = "button.next-pagination-item.next[disabled]"


def _collect_reviews_on_page(driver):
    """Append the data of every review currently displayed to the result lists.

    For each review item this records the review text (or a placeholder when
    the text is blank), the purchased SKU info, the star rating and the date.
    """
    for product in driver.find_elements_by_css_selector("[class='item']"):
        # Review text; blank or whitespace-only text gets a placeholder.
        review = product.find_element_by_css_selector("[class='content']").text
        print(review)
        if review.strip():
            review_csv.append(review)
        else:
            review_csv.append("No comments/review is an image")

        # Product purchase (SKU) information.
        product_purchase = product.find_element_by_css_selector("[class='skuInfo']").text
        print(product_purchase)
        product_csv.append(product_purchase)

        # Star rating: count the star elements showing the rating image.
        star_ratings = product.find_elements_by_css_selector("[class='star']")
        star_rate = sum(
            1 for rating in star_ratings if rating.get_attribute('src') == stars
        )
        rating_csv.append(star_rate)
        print(star_rate)

        # Date of review.
        date = product.find_element_by_css_selector("[class='title right']").text
        date_review_csv.append(date)
        print(date)


try:
    driver.get(url)
    time.sleep(0.1)

    titles = driver.find_element_by_class_name('pdp-mod-product-badge-title').text
    print(titles)

    while True:
        # Re-query the review items on every page: click() returns None, so
        # its result can never be iterated as a list of reviews.
        _collect_reviews_on_page(driver)

        # Stop on the last page (next button disabled) or when the page has
        # no pagination at all.
        if driver.find_elements_by_css_selector(NEXT_BUTTON_DISABLED_SELECTOR):
            break
        next_buttons = driver.find_elements_by_css_selector(NEXT_BUTTON_SELECTOR)
        if not next_buttons:
            break

        # Click through JavaScript rather than a native click, which is what
        # fails with the "element not clickable" error.
        driver.execute_script("arguments[0].click();", next_buttons[0])
        # Give the next page of reviews time to render before reading it.
        time.sleep(2)
finally:
    # Always close the browser window, even if a lookup above raised.
    driver.close()

提前谢谢!

1 个答案:

答案 0 :(得分:0)

要实现翻页,请使用无限循环(`while True`),并在每次循环中检查“下一页”按钮(`next-pagination-item`)是否带有 `disabled` 属性:如果有,就跳出循环;否则点击“下一页”按钮。

代码

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

# Scrape the product title and all pages of customer reviews from one Lazada
# product page, then print the four collected lists.
driver = webdriver.Chrome(executable_path='chromedriver')

# Parallel result lists: index i of each list describes the same review.
review_csv = []
product_csv = []
rating_csv = []
date_review_csv = []

# A star element counts toward the rating only when its src equals this URL.
stars = "https://laz-img-cdn.alicdn.com/tfs/TB19ZvEgfDH8KJjy1XcXXcpdXXa-64-64.png"


def _collect_reviews_on_page(driver):
    """Append the data of every review currently displayed to the result lists.

    Waits up to 10 seconds for the review items to become visible, then
    records, per review, the text (or a placeholder when the text is blank),
    the purchased SKU info, the star rating and the date.
    """
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div.item"))
    )
    for product in driver.find_elements_by_css_selector("[class='item']"):
        # Review text; blank or whitespace-only text gets a placeholder.
        review = product.find_element_by_css_selector("[class='content']").text
        print(review)
        if review.strip():
            review_csv.append(review)
        else:
            review_csv.append("No comments/review is an image")

        # Product purchase (SKU) information.
        product_purchase = product.find_element_by_css_selector("[class='skuInfo']").text
        print(product_purchase)
        product_csv.append(product_purchase)

        # Star rating: count the star elements showing the rating image.
        star_ratings = product.find_elements_by_css_selector("[class='star']")
        star_rate = sum(
            1 for rating in star_ratings if rating.get_attribute('src') == stars
        )
        rating_csv.append(star_rate)
        print(star_rate)

        # Date of review.
        date = product.find_element_by_css_selector("[class='title right']").text
        date_review_csv.append(date)
        print(date)


try:
    driver.get("https://www.lazada.sg/products/loreal-paris-uv-perfect-even-complexion-sunscreen-spf50pa-30ml-i214861100-s325723972.html?spm=a2o42.seller.list.1.758953196tH2Mn&mp=1")
    titles = driver.find_element_by_class_name('pdp-mod-product-badge-title').text
    print(titles)

    while True:
        _collect_reviews_on_page(driver)

        # A disabled "next" button means this was the last page of reviews.
        if driver.find_elements_by_css_selector("button.next-pagination-item.next[disabled]"):
            break

        button_next = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "button.next-pagination-item.next"))
        )
        # Click through JavaScript instead of calling the element's click().
        driver.execute_script("arguments[0].click();", button_next)
        print("next page")
        # Give the next page of reviews time to render before reading it.
        time.sleep(2)
finally:
    # Always close the browser window, even if a wait above timed out.
    driver.close()

print(review_csv)
print(product_csv)
print(rating_csv)
print(date_review_csv)

列表打印如下:

['Fast delivery, send within 3 days, in bubble envelope. Product expiry date : 0522', 'received in good condition. have not try it yet', 'No comments/review is an image', 'No comments/review is an image', 'No comments/review is an image', 'No comments/review is an image', 'No comments/review is an image', 'No comments/review is an image']

['Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30', 'Volume (ml):30']

[5, 5, 5, 4, 4, 5, 5, 5]

['24 Oct 2019', '17 Nov 2019', '21 Nov 2019', '25 Oct 2019', '29 Aug 2019', '24 Apr 2019', '19 Jan 2019', '11 Nov 2018']