抓取谷歌应用商店错误

时间:2017-09-15 06:52:12

标签: python web-scraping phantomjs web-crawler app-store

我使用此代码来抓取此代码。 前几个出来很好 但是错误 我想要你的建议。 我该怎么办?

from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup, Comment
import pandas as pd


#Setting up Chrome webdriver Options
#chrome_options = webdriver.ChromeOptions()

#setting  up local path of chrome binary file
#chrome_options.binary_location = "/Users/Norefly/chromedriver2/chromedriver.exec"

#creating Chrome webdriver instance with the set chrome_options
driver = webdriver.PhantomJS("C:/Python/phantomjs-2.1.1-windows/bin/phantomjs.exe")
link = "https://play.google.com/store/apps/details?id=com.supercell.clashofclans&hl=en"
driver.get(link)
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
Ptitle = driver.find_element_by_class_name('id-app-title').text.replace(' ','')
print(Ptitle)
#driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]').click()

sleep(1)
driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
#select_newest.select_by_visible_text('Newest')
#driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
sleep(2)
#driver.find_element_by_css_selector('.review-filter.id-review-sort-filter.dropdown-menu-container').click()
driver.find_element_by_css_selector('.displayed-child').click()
#driver.find_element_by_xpath("//button[@data-dropdown-value='1']").click()
driver.execute_script("document.querySelectorAll('button.dropdown-child')[0].click()")
reviews_df = []

for i in range(1,10000):
try:for elem in driver.find_elements_by_class_name('single-review'):
print(str(i))
content = elem.get_attribute('outerHTML')
soup = BeautifulSoup(content, "html.parser")
#print(soup.prettify())
date = soup.find('span',class_='review-date').get_text()
rating = soup.find('div',class_='tiny-star')['aria-label'][6:7]
title = soup.find('span',class_='review-title').get_text()
txt = soup.find('div',class_='review-body').get_text().replace('Full Review','')[len(title)+1:]
print(soup.get_text())
temp = pd.DataFrame({'Date':date,'Rating':rating,'Review 
Title':title,'Review Text':txt},index=[0])
print('-'*10)
reviews_df.append(temp)
#print(elem)

except:
    print('s')
driver.find_element_by_xpath('//*[@id="body-content"]/div/div/div[1]/div[2]/div[2]/div[1]/div[4]/button[2]/div[2]/div/div').click()
reviews_df = pd.concat(reviews_df,ignore_index=True)

reviews_df.to_csv(Ptitle+'review_google.csv', encoding='utf-8')

#driver.close()

抓取期间发生此错误,但我不明白此错误。

操作系统是Windows,我正在用Python分析并使用phantomjs。

* google playstore抓取

raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.ElementNotVisibleException: 
    Message: {
        "errorMessage": "Element is not currently visible and may not be manipulated",
        "request":{
            "headers":{
                "Accept":"application/json",
                "Accept-Encoding":"identity",
                "Connection":"close",
                "Content-Length":"81",
                "Content-Type":"application/json; charset=UTF-8",
                "Host":"127.0.0.1:58041",
                "User-Agent":"Python http auth"
            },
            "httpVersion":"1.1",
            "method":"POST",
            "post":"{
                \"id\":  \":wdc:1505360987512\", 
                \"sessionId\": \"b7c59070-98ff-11e7-8363-fdfc8cdfd230 \"
            }",
            "url":"/click",
            "urlParsed": {
                 "anchor":"",
                 "query":"",
                 "file":"click",
                 "directory":"/",
                 "path":" /click",
                 "relative":" /click",
                 "port":"",
                 "host":"",
                 "password":"",
                 "user":"",
                 "userInfo":"",
                 "authority":" ",
                 "protocol":"",
                 "source":"/click",
                 "queryKey":{},
                 "chunks": ["click"]
             },
             "urlOriginal":"/session/b7c59070-98ff-11e7-8363-fdfc8cdfd230/element /:wdc:1505360987512/click"
         }
     }

屏幕截图:可通过屏幕获取

0 个答案:

没有答案