Question

我在抓取下面代码中的网页时，似乎无法获取到各个商品的网址（URL）……你能帮忙吗？

import platform
from bs4 import BeautifulSoup
from selenium import webdriver


def main():
    """Load the Zappos sneaker listing in PhantomJS and print product URLs.

    For each listing page, the page source reported by the browser is parsed
    with BeautifulSoup, the number of matching product anchors is printed,
    and get_sneaker_url is called on every match to print its URL.
    """
    # PhantomJS files have different extensions
    # under different operating systems
    if platform.system() == 'Windows':
        PHANTOMJS_PATH = './phantomjs.exe'
    else:
        PHANTOMJS_PATH = './phantomjs'

    # here we'll use pseudo browser PhantomJS,
    # but browser can be replaced with browser = webdriver.Firefox(),
    # which is good for debugging.
    # NOTE(review): the browser is never shut down (no browser.quit() call).
    browser = webdriver.PhantomJS(PHANTOMJS_PATH)

    # range(1) yields only page 0, so a single listing page is fetched
    for page in range(1):
        page_count = str(page)
        browser.get('http://www.zappos.com/men-sneakers-athletic-shoes/CK_XARC81wHAAQLiAgMBAhg.zso?p=' + str(page) + '&s=brandNameFacetLC/asc/productName/asc/')

        # let's parse our html
        soup = BeautifulSoup(browser.page_source, "html.parser")

        # get all the sneaker items
        # (each match is itself an <a> tag carrying the schema.org Product itemtype)
        sneakers = soup.find_all('a', {'itemtype': 'http://schema.org/Product'})
        sneaker_count = len(sneakers)
        print str(sneaker_count) + " sneakers found on page " + page_count

        # print the URL extracted from every product anchor
        for sneaker in sneakers:
            url = get_sneaker_url(sneaker)
            print url


def get_sneaker_url(sneaker):
    """Intended to return the URL (href) for one sneaker tag from main()."""
    # NOTE(review): this is the line that fails. findAll('a') returns a list
    # of tags, and indexing that list with the string 'href' raises
    # "TypeError: list indices must be integers, not str" (see the traceback
    # quoted below the script). `sneaker` is already an <a> tag selected in
    # main(), so reading sneaker['href'] directly is the usual fix.
    for url in sneaker.findAll('a')['href']:
        return url


# Script entry point: main() runs as soon as this file is executed
# (there is no __name__ == '__main__' guard).
main()

我收到以下错误消息：

C:\Python27\python.exe D:/sneaker-image-scraper/main.py
100 sneakers found on page 0
Traceback (most recent call last):
  File "D:/sneaker-image-scraper/main.py", line 41, in <module>
    main()
  File "D:/sneaker-image-scraper/main.py", line 32, in main
    url = get_sneaker_url(sneaker)
  File "D:/sneaker-image-scraper/main.py", line 37, in get_sneaker_url
    for url in sneaker.findAll('a')['href']:
TypeError: list indices must be integers, not str

Process finished with exit code 1

Answer 1

sneakers = soup.find_all('a', {'itemtype': 'http://schema.org/Product'})

find_all 返回一个列表对象，其中的每一项（即 sneaker）都是一个 Tag 对象，可能带有 href 属性。

    # each item in sneakers is already an <a> tag, so read its href directly
    for sneaker in sneakers:
        url = sneaker['href']
        print url

Tag 对象的属性存储在一个 dict 中，您可以使用 tag['attribute'] 来访问它们。

Answer 2

您不需要 get_sneaker_url 函数，只需使用以下代码：

    for sneaker in sneakers:
        url = sneaker['href']
        print url

解析时无法检索url

2 个答案: