I'm trying to get all the computer science books from Pearson's website (starting from this URL: https://www.pearson.com/us/higher-education/professional---career/computer-science/computer-science.html), but each category's book list is generated by JavaScript.
I tried using Selenium to open the page and then parsing it with BeautifulSoup. After opening a category page, I can't find the <li> tags that contain all the information about a single book.
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Safari()
driver.get('https://www.pearson.com/us/higher-education/professional---career/computer-science/computer-science.html')
wait = WebDriverWait(driver, 2)
content = driver.page_source
soup = BeautifulSoup(content, 'html.parser')
#first I loop through categories
categories = list(driver.find_elements_by_xpath('//ul[@class="category-child-list-level-2"]//a'))
for i in range(len(categories)):
    print('CATEGORY : {}/170'.format(i+1))
    categories[i].click()
    while next_page_link != None:
        WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.CLASS_NAME, "content-tile-book-box")))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print(soup.findAll('li', attrs={'class':'content-tile-book-box visible'})) #it results always empty
        for a in soup.findAll('li', attrs={'class':'content-tile-book-box visible'}):
            #I would like to have access to the books' links
            book_title_link = a.find_element_by_xpath('/div[@class="wrap-list-block"]//a')
        #loop through all the book pages of the current category
        next_page_link = driver.find_element_by_xpath('//a[@aria-label="Next"]')
        next_page_link.click()
I hope you can help me, thanks!
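One possible reason the findAll above always comes back empty (a guess based on how BeautifulSoup handles the class attribute, not something confirmed against the live page): passing attrs={'class': 'content-tile-book-box visible'} only matches elements whose class attribute is exactly that string, in that order. A CSS selector matches on individual classes and is more forgiving, as in this minimal sketch:

#hedged sketch: select() matches any <li> carrying the class, regardless of
#whatever other classes ('visible' may even be added later by JavaScript) sit next to it
soup = BeautifulSoup(driver.page_source, 'html.parser')
for book in soup.select('li.content-tile-book-box'):
    link = book.select_one('div.wrap-list-block a')  #class name taken from the XPath in the question
    if link is not None:
        print(link.get('href'))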
Answer 0 (score: 0)
Since you need to navigate back and forth between pages, I'm providing a Selenium-only solution here, without using BS. I also used chromedriver. Note that the category links are re-located on every iteration of the loop, because navigating away from the listing page makes the previously found elements stale.
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome(executable_path='C:\\Selenium\\chromedriver.exe')
url = 'https://www.pearson.com/us/higher-education/professional---career/computer-science/computer-science.html'
driver.get(url)

#first I loop through categories
categories = list(driver.find_elements_by_xpath('//ul[@class="category-child-list-level-2"]//a'))
Total_Category = len(categories)

for i in range(Total_Category):
    #re-locate the category links on every pass: navigating away made the old references stale
    WebDriverWait(driver, 10).until(ec.visibility_of_all_elements_located((By.XPATH, '//ul[@class="category-child-list-level-2"]//a')))
    categories = list(driver.find_elements_by_xpath('//ul[@class="category-child-list-level-2"]//a'))
    print('CATEGORY : {}/{}'.format(i+1, Total_Category))
    print("Category: " + categories[i].text)  #read the text before clicking; the element goes stale afterwards
    categories[i].click()
    try:
        #loop through all the book pages of the current category
        WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, "//a[@aria-label='Next']")))
        next_page_link = driver.find_element_by_xpath('//a[@aria-label="Next"]')
        while next_page_link is not None:
            WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.CLASS_NAME, "content-tile-book-box")))
            WebDriverWait(driver, 10).until(ec.visibility_of_any_elements_located((By.XPATH, "//div[@class='product-search-results-list section']//li")))
            #I would like to have access to the books' links
            links = driver.find_elements_by_xpath('//div[@class="wrap-list-block"]//a')
            print(len(links))
            book_links = [link.get_attribute('href') for link in links]
            print(book_links)
            try:
                next_page_link = driver.find_element_by_xpath('//a[@aria-label="Next"]')
            except NoSuchElementException:
                print("Reached end of all books in this category")
                driver.get(url)  #go back to the main listing
                break
            next_page_link.click()
    except TimeoutException:
        #category with a single page of results: there is no Next button at all
        print("Next button is not available")
        WebDriverWait(driver, 10).until(ec.visibility_of_any_elements_located((By.XPATH, "//div[@class='product-search-results-list section']//li")))
        links = driver.find_elements_by_xpath('//div[@class="wrap-list-block"]//a')
        print(len(links))
        book_links = [link.get_attribute('href') for link in links]
        print(book_links)
        driver.get(url)  #go back to the main listing
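If you want to keep the scraped links rather than just print them, here is a minimal sketch of one way to persist them (the rows accumulator and the output filename are illustrative additions, not part of the answer above):

import csv

rows = []  #fill inside the loops above, e.g. rows.extend(book_links) instead of print(book_links)

#after the crawl finishes, write everything in one go
with open('pearson_cs_books.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['book_url'])
    writer.writerows([link] for link in rows)

Note also that the find_elements_by_xpath-style calls used above were removed in Selenium 4; on current versions the equivalent is driver.find_elements(By.XPATH, ...).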