从Google Play商店应用网站中提取评论

时间:2020-04-08 13:10:24

标签: python html selenium google-play screen-scraping

import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd

class FindByXpathCss():
    # Declaring variables
    Reviews = []  # List to store final set of reviews
    reviewText = []  # List to store reviews extracted from XPath
    reviewFullText = []

    # Chromedriver path
    driver = webdriver.Chrome(executable_path=r"F:\Chrome-webdriver\chromedriver.exe")
    driver.maximize_window()

    baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"

    driver.get(baseUrl)
    # driver.execute_script("scrollBy(0,300);")
    # Scrolling down
    for i in range(20):
        driver.find_element_by_xpath('//*[@id="yDmH0d"]').send_keys(Keys.ARROW_DOWN, i)
        time.sleep(0.5)

    # To click on Show more button
    #btnShowMore = driver.find_element_by_xpath('//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]''/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span').click()
    # Scrolling to top
    for j in range(10):
        driver.find_element_by_xpath('//*[@id="yDmH0d"]').send_keys(Keys.ARROW_UP, j)

    #for i in range(10):
    review_btn = driver.find_elements_by_xpath("//button[contains(@class,'')][contains(text(),'Full Review')]")
    single_review_btn = driver.find_element_by_xpath("//button[contains(@class,'')][contains(text(),'Full Review')]")
    #time.sleep(1)

具有2个标签的div html标签,其中一个的jsname为“ fbQN7e”,该标签用于保存较大的评论,并且那些评论将具有称为“完整评论”的按钮。同一div html标签中的另一个跨度是“ bN97Pc”,该位置用于容纳较小的评论,而该评论的末尾将没有“完整评论”按钮。我无法获得两种类型的跨度的评论。在这里,我尝试将reviewFullText列表直接写入数据框,但仅获取元素数据类型,而不获取文本。我不知道为什么会这样。

    for btn in review_btn:
        btn.click()
        reviewFullText = driver.find_elements_by_css_selector("span[jsname='fbQN7e']")

    #if(single_review_btn.is_enabled()==False):
        #reviewText = driver.find_elements_by_css_selector("span[jsname=\"bN97Pc\"]")
    ##else:
        #pass

    # Iterating each reviews and appending into list Reviews
    for txtreview in reviewText:
        reviewFullText.append(txtreview.text)

    print(len(reviewFullText))

        # Writing the list values into csv file
    df = pd.DataFrame(reviewFullText)
        #df = pd.DataFrame({'Reviews': 'Reviews'}) #'Sentiment': 'null'})
    df.to_csv('Reviews.csv', index=True, encoding='utf-8')

    driver.close()

1 个答案:

答案 0 :(得分:0)

我已修改您的解决方案以从页面中检索所有评论。

import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
class FindByXpathCss():
        driver = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
        driver.maximize_window()
        baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
        driver.get(baseUrl)

        scrolls = 3
        while True:
            scrolls -= 1
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(3)
            if scrolls < 0:
                break

        buttonClick = WebDriverWait(driver, 30).until(
            EC.visibility_of_all_elements_located((By.XPATH, "//button[contains(@class,'')][contains(text(),'Full Review')]")))
        for element in buttonClick:
            driver.execute_script("arguments[0].click();", element)

        reviewText = WebDriverWait(driver, 30).until(
            EC.presence_of_all_elements_located((By.XPATH, "//*[@class='UD7Dzf']")))


        for textreview in reviewText:
            print textreview.text

        reviewText = WebDriverWait(driver, 30).until(
            EC.presence_of_all_elements_located((By.XPATH, "//*[@class='UD7Dzf']")))

        # reviewText = driver.find_elements_by_xpath("//*[@class='UD7Dzf']")
        for textreview in reviewText:
            print textreview.text

输出:

enter image description here