I'm working on polishing a web scraper that uses Chrome WebDriver to crawl pages. It currently breaks at line 74:

soup = BeautifulSoup(HTML, "html.parser")

with the error:

AttributeError: 'str' object has no attribute 'text'

How do I fix this? I'm not sure why it keeps breaking at this point.
import urllib2, sys
from BeautifulSoup import BeautifulSoup
from datetime import datetime
import requests
from lxml import html
import traceback
import csv
import time
import json
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
username = "user" # your email here
password = "pass" # your password here
pages = 10
companyName = "Apple"
companyURL = "https://www.glassdoor.com/Reviews/Apple-US-Reviews-EI_IE7438.0,6_IL.7,9_IN1.htm?filter.defaultEmploymentStatuses=false&filter.defaultLocation=false"
def obj_dict(obj):
    return obj.__dict__
#enddef

def json_export(data):
    jsonFile = open(companyName + ".json", "w")
    jsonFile.write(json.dumps(data, indent=4, separators=(',', ': '), default=obj_dict))
    jsonFile.close()
#enddef

def init_driver():
    driver = webdriver.Chrome(r'C:\Python27\chromedriver.exe')
    driver.wait = WebDriverWait(driver, 10)
    return driver
#enddef
def login(driver, username, password):
    driver.get("http://www.glassdoor.com/profile/login_input.htm")
    try:
        user_field = driver.wait.until(EC.presence_of_element_located(
            (By.NAME, "username")))
        pw_field = driver.find_element_by_class_name("signin-password")
        login_button = driver.find_element_by_id("signInBtn")
        user_field.send_keys(username)
        user_field.send_keys(Keys.TAB)
        time.sleep(1)
        pw_field.send_keys(password)
        time.sleep(1)
        login_button.click()
    except TimeoutException:
        print("TimeoutException! Username/password field or login button not found on glassdoor.com")
#enddef
###
def get_data(driver, URL, startPage, endPage, data, refresh):
    if (startPage > endPage):
        return data
    #endif
    print "\nPage " + str(startPage) + " of " + str(endPage)
    currentURL = URL + "_IP" + str(startPage) + ".htm"
    time.sleep(2)
    if (refresh):
        driver.get(currentURL)
        print "Getting " + currentURL
    #endif
    time.sleep(2)
    HTML = driver.page_source
    soup = BeautifulSoup(HTML, "html.parser")
    reviews = soup.find_all("li", { "class" : ["empReview", "padVert"] })
    if (reviews):
        data = parse_reviews_HTML(reviews, data)
        print "Page " + str(startPage) + " scraped."
        if (startPage % 10 == 0):
            print "\nTaking a breather for a few seconds ..."
            time.sleep(10)
        #endif
        data = get_data(driver, URL, startPage + 1, endPage, data, True)
    else:
        print "Waiting ... page still loading or CAPTCHA input required"
        time.sleep(3)
        data = get_data(driver, URL, startPage, endPage, data, False)
    #endif
    return data
#enddef
if __name__ == "__main__":
    driver = init_driver()
    time.sleep(3)
    print "Logging into Glassdoor account ..."
    login(driver, username, password)
    time.sleep(5)
    print "\nStarting data scraping ..."
    data = get_data(driver, companyURL[:-4], 1, pages, [], True)
    print "\nExporting data to " + companyName + ".json"
    json_export(data)
    driver.quit()
#endif
summary_box = soup.find('span', attrs={'class': 'summary '})
summary = summary_box.text.strip()
print summary
Answer (score: 1)

You are probably using BeautifulSoup version 3 (I tried it and ran into the same problem). Even if that is not the case, try removing the "html.parser" argument and just doing:

soup = BeautifulSoup(HTML)

I hope that works. :)
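
A side note beyond the original answer: the import from BeautifulSoup import BeautifulSoup at the top of the script is the BeautifulSoup 3 package, whose constructor treats the second positional argument as a SoupStrainer rather than a parser name, which is exactly why the string "html.parser" produces 'str' object has no attribute 'text'. If you would rather keep the "html.parser" argument, a minimal sketch of the usual fix, assuming you can pip install beautifulsoup4 in the same Python 2 environment (the HTML string below is a stand-in for driver.page_source):

# Check which version is actually installed (the bs4 package is BeautifulSoup 4).
import bs4
print bs4.__version__

# Import from the bs4 package instead of the old BeautifulSoup module;
# BeautifulSoup 4's constructor accepts an explicit parser name.
from bs4 import BeautifulSoup

HTML = "<li class='empReview'>example review</li>"  # stand-in for driver.page_source
soup = BeautifulSoup(HTML, "html.parser")
print soup.find_all("li", {"class": "empReview"})

This also matters a few lines further down in the script: soup.find_all(...) only exists in BeautifulSoup 4, while version 3 spells it findAll, so the scrape would break again even if the constructor succeeded.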