抓取google play / python

时间:2017-10-03 04:24:03

标签: python web-scraping phantomjs google-crawlers

我有一段用来抓取 Google Play 商店的代码,但是这段代码有点问题。

发生异常之后,我不希望它从头重新开始爬取,而是希望它从中断的地方继续爬取。

如果您能改进这段代码,或者有更好的思路,请告诉我。

"""Scrape reviews of a Google Play app with Selenium + BeautifulSoup.

Opens the app's store page, switches the review sort order via the
dropdown, then repeatedly parses the visible ``single-review`` elements
and saves the collected reviews to ``<AppTitle>review_google.csv``.

Fixes versus the original:
* the body of the inner ``for elem in ...`` loop was not indented,
  which is a SyntaxError — the script could not run at all;
* the bare ``except:`` swallowed every error (even KeyboardInterrupt);
  it now catches only WebDriver failures and recovers in place;
* reviews gathered before a crash are no longer lost: the CSV is
  written in a ``finally`` block, so an error mid-crawl keeps what was
  already collected instead of starting over from scratch;
* ``driver.quit()`` replaces ``driver.close()`` so the PhantomJS
  process is actually terminated.
"""
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from time import sleep
from bs4 import BeautifulSoup
import pandas as pd

# XPath of the review-sort dropdown button on the (2017-era) Play page;
# the original pasted this literal three times.
SORT_BUTTON_XPATH = ('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]'
                     '/div[1]/div[4]/button[2]/div[2]/div/div')


def parse_review(elem):
    """Parse one ``single-review`` WebElement into a dict.

    Returns None when the review markup is malformed (missing span/div),
    so the caller can skip it and keep crawling.
    """
    soup = BeautifulSoup(elem.get_attribute('outerHTML'), "html.parser")
    try:
        date = soup.find('span', class_='review-date').get_text()
        # aria-label reads like "Rated 4 stars out of five stars";
        # character 6 is the rating digit — TODO confirm for all locales.
        rating = soup.find('div', class_='tiny-star')['aria-label'][6:7]
        title = soup.find('span', class_='review-title').get_text()
        # The body repeats the title, so strip the leading title text and
        # the trailing "Full Review" link label.
        txt = (soup.find('div', class_='review-body').get_text()
               .replace('Full Review', '')[len(title) + 1:])
    except (AttributeError, KeyError, TypeError):
        return None
    return {'Date': date, 'Rating': rating,
            'Review Title': title, 'Review Text': txt}


def main():
    driver = webdriver.PhantomJS(
        "C:/Python/phantomjs-2.1.1-windows/bin/phantomjs.exe")
    link = ("https://play.google.com/store/apps/details"
            "?id=com.supercell.clashofclans&hl=en")
    driver.get(link)

    # App title, spaces removed, used as the CSV filename prefix.
    ptitle = driver.find_element_by_class_name(
        'id-app-title').text.replace(' ', '')
    print(ptitle)

    sleep(1)
    driver.find_element_by_xpath(SORT_BUTTON_XPATH).click()
    sleep(2)
    driver.find_element_by_css_selector('.displayed-child').click()
    # Pick the first entry of the opened sort dropdown.
    driver.execute_script(
        "document.querySelectorAll('button.dropdown-child')[0].click()")

    reviews = []
    try:
        for page in range(1, 10):
            try:
                for elem in driver.find_elements_by_class_name('single-review'):
                    print(str(page))
                    review = parse_review(elem)
                    if review is not None:
                        print('-' * 10)
                        reviews.append(review)
            except WebDriverException as exc:
                # Page went stale or an element vanished: reopen the
                # dropdown and carry on instead of restarting the crawl.
                print('recovering from: %s' % exc)
                driver.find_element_by_xpath(SORT_BUTTON_XPATH).click()
    finally:
        # Persist whatever was collected, even if the crawl died midway,
        # and always shut PhantomJS down.
        if reviews:
            df = pd.DataFrame(reviews)
            df.to_csv(ptitle + 'review_google.csv', encoding='utf-8')
        driver.quit()


if __name__ == '__main__':
    main()

0 个答案:

没有答案