我在刮下下面的网页时似乎无法检索到网址...你能帮忙吗?
import platform
from bs4 import BeautifulSoup
from selenium import webdriver
def main():
# PhantomJS files have different extensions
# under different operating systems
if platform.system() == 'Windows':
PHANTOMJS_PATH = './phantomjs.exe'
else:
PHANTOMJS_PATH = './phantomjs'
# here we'll use pseudo browser PhantomJS,
# but browser can be replaced with browser = webdriver.FireFox(),
# which is good for debugging.
browser = webdriver.PhantomJS(PHANTOMJS_PATH)
for page in range(1):
page_count = str(page)
browser.get('http://www.zappos.com/men-sneakers-athletic-shoes/CK_XARC81wHAAQLiAgMBAhg.zso?p=' + str(page) + '&s=brandNameFacetLC/asc/productName/asc/')
# let's parse our html
soup = BeautifulSoup(browser.page_source, "html.parser")
# get all the sneaker items
sneakers = soup.find_all('a', {'itemtype': 'http://schema.org/Product'})
sneaker_count = len(sneakers)
print str(sneaker_count) + " sneakers found on page " + page_count
for sneaker in sneakers:
url = get_sneaker_url(sneaker)
print url
def get_sneaker_url(sneaker):
for url in sneaker.findAll('a')['href']:
return url
main()
我收到以下错误消息:
C:\Python27\python.exe D:/sneaker-image-scraper/main.py
100 sneakers found on page 0
Traceback (most recent call last):
File "D:/sneaker-image-scraper/main.py", line 41, in <module>
main()
File "D:/sneaker-image-scraper/main.py", line 32, in main
url = get_sneaker_url(sneaker)
File "D:/sneaker-image-scraper/main.py", line 37, in get_sneaker_url
for url in sneaker.findAll('a')['href']:
TypeError: list indices must be integers, not str
Process finished with exit code 1
答案 0 :(得分:2)
sneakers = soup.find_all('a', {'itemtype': 'http://schema.org/Product'})
find_all
返回一个列表对象,它的项sneaker
是标记对象,可能包含href
属性
for sneaker in sneakers:
url = sneaker['href']
print url
标记对象的属性存储在dict中,您可以使用tag['attribute']
来访问它。
答案 1 :(得分:1)
您不需要for sneaker in sneakers:
url = sneaker['href']
print url
功能,只需使用以下代码:
{{1}}