import requests
import os
from pythonds.graphs import Graph,Vertex,adjGraph
from bs4 import BeautifulSoup
# Seed URL for the crawl: a Yellow Pages listing of Los Angeles coffee shops.
root="https://www.yellowpages.com/los-angeles-ca/coffee-shops"
# Function to fetch the links contained in a Web Page
def fetchHref(link):
    """Fetch the page at *link* and return every href that contains "http".

    Parameters
    ----------
    link : str
        URL of the page to download (performs a live HTTP GET).

    Returns
    -------
    list[str]
        The ``href`` values of all ``<a>`` tags whose href is a string
        containing the substring "http" (the original's filter, kept as-is
        so relative links are still excluded the same way).
    """
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    hrefs = []
    for anchor in soup.find_all("a"):
        # Hoist the attribute lookup: the original called anchor.get("href")
        # three times per tag, and its loop variable shadowed the `link`
        # parameter.
        href = anchor.get("href")
        if isinstance(href, str) and "http" in href:
            hrefs.append(href)
    return hrefs
#Function to make edges from a webpage to its corresponding links
def ev(g, f, t=None):
    """Ensure vertex *f* exists in graph *g*; optionally link it to *t*.

    Parameters
    ----------
    g : Graph
        Graph with ``addVertex``, ``addEdge`` and ``__contains__``.
    f : hashable
        Source vertex; added if not already present.
    t : hashable, optional
        Target vertex. When given, it is added and an edge f -> t with
        weight 1 is created.

    Notes
    -----
    The original added a ``None`` vertex when called as ``ev(g, f)`` with
    *f* already present, and silently dropped the edge when *f* was
    missing but *t* was given. Both call sites in ``make_graph`` behave
    identically under this fixed version.
    """
    if f not in g:
        g.addVertex(f)
    if t is not None:
        g.addVertex(t)
        g.addEdge(f, t, 1)
#Makes a graph for a root webpage to its constituents upto a specified depth
def make_graph(root, depth=None):
    """Build a directed graph of hyperlinks reachable from *root*.

    Expands the link graph breadth-first: level 0 is *root*, level k+1 is
    every link found on the pages of level k.

    Parameters
    ----------
    root : str
        Starting URL.
    depth : int, optional
        Number of expansion levels. ``None`` is treated as 1 — the
        original default never terminated because ``terminate == None``
        is always false.

    Returns
    -------
    Graph
        pythonds Graph whose vertices are URLs and whose edges (weight 1)
        point from a page to the links it contains.
    """
    if depth is None:
        depth = 1  # guard against the original's infinite loop
    g = Graph()
    ev(g, root)  # seed the root vertex
    frontier = [root]
    level = 0
    # Stop when the requested depth is reached or no new links were found.
    while frontier and level < depth:
        next_frontier = []
        for page in frontier:
            # NOTE(review): the original created a directory `lpath` here,
            # but `lpath` was never defined (NameError at runtime). The
            # download-to-folders step is still TODO; removed for now.
            for href in fetchHref(page):
                if href != page:  # skip trivial self-links
                    ev(g, page, href)
                    next_frontier.append(href)
        frontier = next_frontier
        level += 1
    return g
# Build the link graph one level deep from the seed URL. NOTE: this runs at
# import time and performs live HTTP requests via fetchHref.
g=make_graph(root,1) #Resultant Graph
# TODO(author): Problem statement — download the web pages related to the root
# page up to a specified depth and store them in a directory structure. All
# links stored in the graph should be saved into a hierarchical folder layout;
# help is needed writing the recursive function that does this.