从存储在图表中的链接保存网页(Python)

时间:2018-02-22 01:40:20

标签: python recursion graph beautifulsoup

import requests
import os
from pythonds.graphs import Graph,Vertex,adjGraph
from bs4 import BeautifulSoup

# Seed URL for the crawl: Yellow Pages coffee-shop listings for Los Angeles.
root="https://www.yellowpages.com/los-angeles-ca/coffee-shops"

# Function to fetch the links contained in a Web Page
def fetchHref(link):
    """Return the absolute HTTP(S) hrefs found in the page at *link*.

    Fetches *link* over HTTP, parses every ``<a>`` tag, and collects each
    ``href`` attribute that is a string containing ``"http"``.

    Args:
        link: URL of the page to fetch.

    Returns:
        list[str]: hrefs that look like absolute URLs, in document order.
    """
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    hrefs = []
    for anchor in soup.find_all("a"):
        # Look the attribute up once (the original called .get("href")
        # three times per anchor) and avoid shadowing the parameter.
        href = anchor.get("href")
        # NOTE: substring check preserved from the original — it accepts
        # any URL containing "http"; startswith(("http://", "https://"))
        # would be stricter but change behavior.
        if isinstance(href, str) and "http" in href:
            hrefs.append(href)
    return hrefs

#Function to make edges from a webpage to its corresponding links
def ev(g, f, t=None):
    """Ensure vertex *f* exists in graph *g*; optionally add edge f -> t.

    Args:
        g: graph supporting ``in`` (``__contains__``), ``addVertex`` and
           ``addEdge(frm, to, weight)``.
        f: source vertex key; added to the graph if missing.
        t: optional target vertex key; when given, it is added and an
           edge ``f -> t`` with weight 1 is created.

    Bug fix vs. the original: the edge (and vertex *t*) were only added
    when *f* was already in the graph — otherwise they were silently
    dropped — and calling with *t* omitted while *f* existed inserted a
    spurious ``None`` vertex and a ``f -> None`` edge.
    """
    if f not in g:
        g.addVertex(f)
    if t is not None:
        g.addVertex(t)
        g.addEdge(f, t, 1)

#Makes a graph for a root webpage to its constituents upto a specified depth
def make_graph(root,depth=None):
    #fun=0
    terminate=0
    g = Graph()
    ev(g, root)
    rootList=[]
    tempList=[]
    rootList.append(root)
    while True:
        for j in range(len(rootList)):
            lst = fetchHref(rootList[j])
            if not os.path.exists(lpath):
                os.makedirs(lpath)
            for ctr in range(len(lst)):
                if rootList[j]!=lst[ctr]:
                    ev(g,rootList[j],lst[ctr])
                    tempList.append(lst[ctr])
        rootList=[]
        rootList=tempList
        tempList=[]
        terminate = terminate+1
        if terminate == depth:
            break
    return g

# NOTE: executes at import time and performs live HTTP requests
# (crawls one level of links below the root page).
g=make_graph(root,1) #Resultant Graph

**我的问题陈述是:我需要下载与根网页相关的、到特定深度的网页,并将它们存储在目录结构中。**

**我需要将存储在图中的所有链接保存为层级文件夹结构。我很难写出这个递归函数,任何帮助都会非常有用。**

0 个答案:

没有答案