I'm trying to scrape product URLs from an Old Navy page, but it only returns part of the product list instead of all of it (for example, only 8 URLs when there are more than 8 products). I hope someone can help identify the problem.
from bs4 import BeautifulSoup
from selenium import webdriver

link = "http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true"
base_url = "http://www.oldnavy.com"

driver = webdriver.PhantomJS()
driver.get(link)

html = driver.page_source
soup = BeautifulSoup(html, "html5lib")  # the html5lib package must be installed for this parser

bigDiv = soup.findAll("div", class_="sp_sm spacing_small")
for div in bigDiv:
    links = div.findAll("a")
    for i in links:
        productUrl = base_url + i["href"]
        print productUrl
Answer (score: 2)
This page uses JavaScript to load its elements, but only as you scroll down the page. This is so-called "lazy loading". Your script has to scroll the page as well:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

link = "http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true"
base_url = "http://www.oldnavy.com"

driver = webdriver.PhantomJS()
driver.get(link)

# --- scrolling ---
# Scroll to the bottom repeatedly until the page height stops growing,
# i.e. until no more products are being lazy-loaded.
lastHeight = driver.execute_script("return document.body.scrollHeight")
pause = 0.5

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)  # give the page time to load the next batch
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
# ---

html = driver.page_source
soup = BeautifulSoup(html, "html5lib")

divs = soup.find_all("div", class_="sp_sm spacing_small")
for div in divs:
    links = div.find_all("a")
    for a in links:
        print base_url + a["href"]
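
The scroll loop stops when document.body.scrollHeight stops changing, which is a simple but effective signal that everything has been loaded. Note that PhantomJS is deprecated in current Selenium releases; below is a minimal sketch of the same scrolling approach using headless Chrome instead (assuming chromedriver is installed and on your PATH; adjust to your setup):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

opts = Options()
opts.add_argument("--headless")  # run Chrome without a visible window
driver = webdriver.Chrome(options=opts)
driver.get("http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true")

# Same lazy-loading workaround: scroll until the page height stops growing.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)  # give the lazy loader time to fetch the next batch
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

html = driver.page_source
driver.quit()

From here the parsing with BeautifulSoup is identical to the answer above.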