I scraped links from a page with Beautiful Soup; how do I now loop through these links?

Date: 2017-07-06 14:37:18

Tags: python python-3.x beautifulsoup

Here is my code for retrieving the links from a web page.

from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import re

def getExternalLinks(includeURL):
    html = urlopen(includeURL)
    bsObj = soup(html, "html.parser")
    externalLinks = []
    links = bsObj.findAll(
        "a", href=re.compile("^(http://www.homedepot.com/b)"))
    for link in links:
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])

    print(externalLinks)

getExternalLinks("http://www.homedepot.com/")

The links are stored in the array below.

['http://www.homedepot.com/b/Appliances/N-5yc1vZbv1w?cm_sp=d-flyout-Appliances', 'http://www.homedepot.com/b/Bath/N-5yc1vZbzb3?cm_sp=d-flyout-Bath_and_Faucets', 'http://www.homedepot.com/b/Decor/N-5yc1vZas6p?cm_sp=d-flyout-Blinds_and_Decor', 'http://www.homedepot.com/b/Building-Materials/N-5yc1vZaqns?cm_sp=d-flyout-Building_Materials', 'http://www.homedepot.com/b/Doors-Windows/N-5yc1vZaqih?cm_sp=d-flyout-Doors_and_Windows', 'http://www.homedepot.com/b/Electrical/N-5yc1vZarcd?cm_sp=d-flyout-Electrical', 'http://www.homedepot.com/b/Flooring/N-5yc1vZaq7r?cm_sp=d-flyout-Flooring_and_Area_Rugs', 'http://www.homedepot.com/b/Hardware/N-5yc1vZc21m', 'http://www.homedepot.com/b/Heating-Venting-Cooling/N-5yc1vZc4k8?cm_sp=d-flyout-Heating_and_Cooling', 'http://www.homedepot.com/b/Kitchen/N-5yc1vZar4i?cm_sp=d-flyout-Kitchen', 'http://www.homedepot.com/b/Outdoors-Garden-Center/N-5yc1vZbx6k?cm_sp=d-flyout-Lawn_and_Garden', 'http://www.homedepot.com/b/Lighting-Ceiling-Fans/N-5yc1vZbvn5?cm_sp=d-flyout-Lighting_and_Ceiling_Fans', 'http://www.homedepot.com/b/Outdoors/N-5yc1vZbx82?cm_sp=d-flyout-Outdoor_Living', 'http://www.homedepot.com/b/Paint/N-5yc1vZar2d?cm_sp=d-flyout-Paint', 'http://www.homedepot.com/b/Plumbing/N-5yc1vZbqew?cm_sp=d-flyout-Plumbing', 'http://www.homedepot.com/b/Storage-Organization/N-5yc1vZas7e?cm_sp=d-flyout-Storage_and_Organization', 'http://www.homedepot.com/b/Tools/N-5yc1vZc1xy']

Now I am trying to loop through these links, go to each page, and get information. When I run the next piece of code, errors keep popping up.

def getInternalLinks(includeLinks):
    internalHTML = urlopen(includeLinks)
    Inner_bsObj = soup(internalHTML, "html.parser")
    internalLinks = []
    inner_links = Inner_bsObj.findAll("a", "href")

    for inner_link in inner_links:
        if inner_link.attrs['href'] is not None:
            if inner_link.attrs['href'] not in internalLinks:
                internalLinks.append(inner_link.attrs['href'])
    print(internalLinks)

getInternalLinks(getExternalLinks("http://www.homedepot.com"))

File "C:/Users/anag/Documents/Python 
Scripts/Webscrapers/BeautifulSoup/HomeDepot/HomeDepotScraper.py", line 20, 
in getInternalLinks
internalHTML = urlopen(includeLinks)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\ProgramData\Anaconda3\lib\urllib\request.py", line 517, in open
req.timeout = timeout
AttributeError: 'NoneType' object has no attribute 'timeout'

How should I extract information from each of the web pages stored in my externalLinks array?

1 Answer:

Answer 0 (score: 0)

It is a list, not an array. "Array" in Python most of the time means a NumPy array, which is something completely different from a list.
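For illustration, a quick check in the interpreter (this assumes NumPy is installed; the variable name items is just an example):

    items = [1, 2, 3]                # a built-in list
    print(type(items))               # <class 'list'>

    import numpy
    print(type(numpy.array(items))) # <class 'numpy.ndarray'>, a different type
                                    # with a fixed dtype and element-wise arithmetic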

The problem with your code is that the getExternalLinks() function returns None, and that is what gets fed as the argument to getInternalLinks(), which expects a single URL; that is why urlopen(None) blows up with the AttributeError above. The first function needs to return the list (or set) of URLs instead of (just) printing them, and then you need to loop over the return value and feed each URL to the second function.
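As a minimal sketch of that fix, keeping your function names and assuming getExternalLinks() is changed to end with return externalLinks instead of print(externalLinks):

    external_links = getExternalLinks("http://www.homedepot.com/")
    for url in external_links:
        # getInternalLinks() now receives one URL string at a time,
        # which is what urlopen() expects
        getInternalLinks(url)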

Both functions contain almost identical code. Apart from the naming differences, only the arguments to the findAll() method differ. I would refactor this into a common function.

import re
from urllib.request import urlopen
from bs4 import BeautifulSoup


def get_links(url, attrs=None):
    """Return the set of href values of all <a> elements matching attrs."""
    if attrs is None:
        attrs = dict()
    links = set()
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    for a_node in soup.find_all('a', attrs):
        link = a_node.get('href')
        if link is not None:
            links.add(link)
    return links


def main():
    external_links = get_links(
        'http://www.homedepot.com/',
        # raw string with the dots escaped so each '.' matches a literal dot
        {'href': re.compile(r'^http://www\.homedepot\.com/b')},
    )
    print(external_links)
    for link in external_links:
        # 
        # TODO I am not sure if you really want to filter on <a> elements
        #      with a class of 'href' but that is what your code did, so...
        # 
        internal_links = get_links(link, {'class': 'href'})
        print(internal_links)


if __name__ == '__main__':
    main()
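
Note that get_links() collects the links in a set: duplicates are discarded automatically, so the explicit 'not in' membership check from your original code is no longer needed, and membership tests are O(1) instead of the O(n) scans a list needs. The trade-off is that a set does not preserve the order in which the links appear on the page.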