https://repo1.maven.org/maven2/
此文件夹包含许多子目录和文件。我只想使用Python下载maven-metadata.xml
文件。我尝试了另一个回答中的做法,但它没有递归遍历子目录。
答案 0 :(得分:1)
我也建议您使用 BeautifulSoup。您可以像下面这样做;我对"是否为目录"的判断非常简单(只看链接是否以 '/' 结尾)。
import re
from urllib.parse import urljoin
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup
def isDirectory(url: str) -> bool:
    """Return True when *url* looks like a directory link.

    The check is purely textual: a link counts as a directory when it
    ends with '/'. No request is made to confirm it.
    """
    return url.endswith('/')
def findLinks(url):
    """Recursively walk an HTML directory listing for maven-metadata.xml.

    Fetches *url*, prints every ``href`` found on the page together with
    the result of ``isDirectory`` for it, descends into links that end
    with '/', and prints "GOTCHA!" for each link ending in
    ``maven-metadata.xml``.

    Only links that resolve to a location strictly below *url* are
    followed or reported. This skips the parent-directory link ("../"),
    self links and absolute links to other locations, which would
    otherwise send the recursion back up the tree without end.

    Args:
        url: URL of the directory listing to scan. A trailing '/' is
            appended when missing so relative links resolve inside it.

    Raises:
        requests.exceptions.RequestException: if the page cannot be
            fetched (including when the 30 second timeout expires).
    """
    # urljoin replaces the last path segment of a base without a trailing
    # slash, so normalise the base before resolving relative links.
    if not url.endswith('/'):
        url += '/'

    # A timeout keeps one stalled connection from hanging the whole crawl.
    page = requests.get(url, timeout=30).content
    bsObj = BeautifulSoup(page, 'html.parser')

    for link in bsObj.find_all('a', href=True):
        href = link['href']
        print(href)
        print(isDirectory(href))

        target = urljoin(url, href)
        # Ignore anything that does not lead deeper into this directory.
        if target == url or not target.startswith(url):
            continue

        if isDirectory(href):
            findLinks(target)  # recurse into the sub-directory
        elif href.endswith('maven-metadata.xml'):
            # `target` is the absolute URL of the file to save/download.
            print("GOTCHA!")
# Root URL of the Maven repository listing where the crawl begins.
startUrl = "https://repo1.maven.org/maven2/"
# Executed at module level: starts the recursive scan from startUrl.
findLinks(startUrl)