I have a script that downloads PDF files from the page https://webpage.com/products/waste-water/. On that page there are many links of the form https://webpage.com/product/
and each of those sub-pages contains PDF files.
How can I add the functionality to follow every sub-page whose link has the form https://webpage.com/product/
and download the PDF files from there as well?
My current code:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "https://webpage.com/products/waste-water/"

# If there is no such folder, the script will create one automatically
folder_location = r'C:\temp\webscraping'
if not os.path.exists(folder_location):
    os.mkdir(folder_location)

response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

for link in soup.select("a[href$='.pdf']"):
    # Name the pdf files using the last portion of each link which are unique in this case
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
Edit: here is the actual link:
https://www.nordicwater.com/products/waste-water/
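For reference, a minimal sketch of the requested extension, built directly on the question's code. It assumes the product sub-pages can be recognized simply by "/product/" appearing in their href; that selector is an assumption about the site's markup, not something confirmed by it, and the folder path is carried over from the code above.

import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

base_url = "https://www.nordicwater.com/products/waste-water/"
folder_location = r'C:\temp\webscraping'
os.makedirs(folder_location, exist_ok=True)

def pdf_links(page_url):
    # Yield absolute URLs of all PDF links found on one page.
    soup = BeautifulSoup(requests.get(page_url).text, "html.parser")
    for link in soup.select("a[href$='.pdf']"):
        yield urljoin(page_url, link["href"])

def product_pages(listing_url):
    # Yield absolute URLs of sub-pages whose href contains '/product/'
    # (assumed pattern for the product detail pages).
    soup = BeautifulSoup(requests.get(listing_url).text, "html.parser")
    for link in soup.select("a[href*='/product/']"):
        yield urljoin(listing_url, link["href"])

# Download PDFs from the listing page itself and from every product sub-page.
for page in {base_url, *product_pages(base_url)}:
    for pdf_url in pdf_links(page):
        filename = os.path.join(folder_location, pdf_url.split("/")[-1])
        with open(filename, "wb") as f:
            f.write(requests.get(pdf_url).content)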
Answer 0 (score: 1):
import requests
from bs4 import BeautifulSoup

main = "https://www.nordicwater.com/products/waste-water/"

def Get_Links():
    # Collect the product sub-page links from the listing page.
    r = requests.get(main).text
    soup = BeautifulSoup(r, 'html.parser')
    links = []
    for item in soup.findAll("a", {'class': 'ap-area-link'}):
        links.append(item.get("href"))
    return links

def Parse_Links():
    # Visit each sub-page and collect the PDF links from its download section.
    pdf = set()
    for url in Get_Links():
        r = requests.get(url).text
        soup = BeautifulSoup(r, 'html.parser')
        for item in soup.findAll("div", {'class': 'dl-items'}):
            for link in item.findAll("a"):
                link = link.get("href")
                if link:
                    pdf.add(link)
    return pdf

def Save():
    # Download each PDF; item[55:] strips the common URL prefix to get the filename.
    for item in Parse_Links():
        print(f"Downloading File: {item[55:]}")
        r = requests.get(item)
        with open(f"{item[55:]}", 'wb') as f:
            f.write(r.content)
    print("done")

Save()
Output:
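One caveat about the answer's code (separate from its output): item[55:] relies on every PDF URL sharing a 55-character prefix, and the files are written to the current working directory. A more robust variant, sketched below under the assumption that the question's folder_location should be reused, derives the filename from the URL path instead.

import os
from urllib.parse import urlsplit

folder_location = r'C:\temp\webscraping'

def save_pdf(url, content):
    # Take the last path segment of the URL as the filename,
    # e.g. 'https://example.com/docs/file.pdf' -> 'file.pdf'.
    filename = os.path.basename(urlsplit(url).path)
    with open(os.path.join(folder_location, filename), 'wb') as f:
        f.write(content)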