I followed some online guides and tried to put together a script that identifies and downloads all the PDF files on a website, so I don't have to do it by hand. Here is my code so far:
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib
# connect to website and get list of all pdfs
url="http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(.pdf)'))
# clean the pdf link names
url_list = []
for el in links:
    url_list.append(("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href']))
#print(url_list)
# download the pdfs to a specified location
for url in url_list:
    print(url)
    fullfilename = os.path.join('E:\webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", "").replace(".pdf",""))
    print(fullfilename)
    request.urlretrieve(url, fullfilename)
The code seems to find all the PDFs (uncomment print(url_list) to see them). However, it fails at the download stage. In particular, I get the error below and cannot work out what has gone wrong:
E:\webscraping>python get_pdfs.py
http://www.gatsby.ucl.ac.uk/teaching/courses/http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/cribsheet.pdf
E:\webscraping\http://www.gatsby.ucl.ac.uk/teaching/courses/cribsheet
Traceback (most recent call last):
File "get_pdfs.py", line 26, in <module>
request.urlretrieve(url, fullfilename)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\User\Anaconda3\envs\snake\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
Can anyone help me out?
Answer 0 (score: 3)
Try the implementation below. I've used the requests module instead of urllib to do the downloading, and the .select() method instead of .find_all() to avoid using re.
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
#If there is no such folder, the script will create one automatically
folder_location = r'E:\webscraping'
if not os.path.exists(folder_location):os.mkdir(folder_location)
response = requests.get(url)
soup= BeautifulSoup(response.text, "html.parser")
for link in soup.select("a[href$='.pdf']"):
    #Name the pdf files using the last portion of each link which are unique in this case
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        f.write(requests.get(urljoin(url, link['href'])).content)
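For comparison, the same link selection could also be done with .find_all() and a compiled regex, as in the question. This is only a minimal sketch and assumes the same soup object built above:

import re

# Equivalent selection with find_all: match anchors whose href ends in ".pdf".
# soup is assumed to be the BeautifulSoup object created above.
pdf_links = soup.find_all('a', href=re.compile(r'\.pdf$'))
for link in pdf_links:
    print(link['href'])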
Answer 1 (score: 3)
Generally, the answer above should work. However, you should also evaluate the HTML source of the page you are working with. For example, some pages expose an og_url property in a meta tag while others do not; this can be the case on a secured site (for instance your university's course page). In such cases you will have to extract the PDF links in a different way. You can find a good explanation and solution here:
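As a purely illustrative sketch (not the linked solution), checking whether a page declares an og:url meta tag with BeautifulSoup might look like this; the URL below is a placeholder:

import requests
from bs4 import BeautifulSoup

# Placeholder URL: substitute the course page you are actually scraping.
response = requests.get("https://example.com/some-course-page")
soup = BeautifulSoup(response.text, "html.parser")

# If an Open Graph URL is declared, prefer it as the base for resolving PDF links;
# otherwise fall back to the URL the response actually came from.
og_url = soup.find("meta", property="og:url")
base_url = og_url["content"] if og_url else response.url
print(base_url)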
Answer 2 (score: 0)
A couple of the links already contain the server address, which is what causes the 404 error. Also, you should not remove the .pdf from the file name, because then the file is saved without an extension.
from urllib import request
from bs4 import BeautifulSoup
import re
import os
import urllib
# connect to website and get list of all pdfs
url="http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"
response = request.urlopen(url).read()
soup= BeautifulSoup(response, "html.parser")
links = soup.find_all('a', href=re.compile(r'(.pdf)'))
# clean the pdf link names
url_list = []
for el in links:
    if el['href'].startswith('http'):
        url_list.append(el['href'])
    else:
        url_list.append("http://www.gatsby.ucl.ac.uk/teaching/courses/" + el['href'])
print(url_list)
# download the pdfs to a specified location
for url in url_list:
    print(url)
    fullfilename = os.path.join('E:\webscraping', url.replace("http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/", ""))
    print(fullfilename)
    request.urlretrieve(url, fullfilename)
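A slightly cleaner alternative to the startswith('http') check, offered here only as a sketch, is to let urllib.parse.urljoin resolve each href against the page URL; absolute links pass through unchanged, and relative ones (the relative form below is an assumed example) are resolved correctly:

from urllib.parse import urljoin

base = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"

# An absolute href is returned unchanged...
print(urljoin(base, "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/cribsheet.pdf"))
# ...while a relative href (assumed form) is resolved against the page URL.
print(urljoin(base, "ml1-2016/cribsheet.pdf"))
# Both print: http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016/cribsheet.pdf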
Answer 3 (score: 0)
I wrote a small script based on @SIM's answer, with argparse added on top. My full code is as follows:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import argparse
#%% Example
# one pdf
# python all_pdf_dl.py -l https://memento.epfl.ch/academic-calendar/ --save-here
# many pdfs
# python all_pdf_dl.py -l https://idsc.ethz.ch/education/lectures/recursive-estimation.html
#%% Functions
def all_pdf_download(args):
    base_url = args.link
    if args.save_here:
        folder_path = os.getcwd()
    else:
        folder_path = args.folder_path
        if not os.path.exists(args.folder_path): os.mkdir(args.folder_path)
    print("====== 1. Set savepath: {} ======".format(folder_path))
    print("====== 2. Start searching ======")
    #response = requests.get(base_url)
    response = requests.get(base_url, headers={'User-Agent': 'Custom'})
    soup = BeautifulSoup(response.text, "html.parser")
    search_res = soup.select("a[href$='.pdf']")
    print("{} files found!!!".format(len(search_res)))
    print("====== 3. Start downloading ======")
    for counter, link in enumerate(search_res):
        #Name the pdf files using the last portion of each link which are unique in this case
        filename = link['href'].split('/')[-1]
        file_save_path = os.path.join(folder_path, link['href'].split('/')[-1])
        if args.print_all:
            print("[{}/{}] {}".format(counter+1, len(search_res), filename))
        with open(file_save_path, 'wb') as f:
            f.write(requests.get(urljoin(base_url, link['href'])).content)
    print("====== 4. Finished!!! ======")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Test argparse')
    ####################################
    ############ ALL OPTION ############
    ## Main option
    # -l/--link
    parser.add_argument('-l', '--link', required=True, type=str,
                        help='write down site name')
    # --print-all
    parser.add_argument('--print-all', dest='print_all', action='store_true',
                        help="print all filename")
    parser.set_defaults(print_all=True)
    # --save-here
    parser.add_argument('--save-here', dest='save_here', action='store_true',
                        help="save files here")
    parser.set_defaults(save_here=False)
    # --save--folder
    # default setting -> Downloads/ in user's home directory obtained by (os.path.expanduser('~'))
    parser.add_argument('-f', '--folder_path', default=r""+os.path.join(os.path.expanduser('~'), "Downloads"),
                        type=str, help='save files in the given folder')
    ########################################
    ############ PARSING OPTION ############
    args = parser.parse_args()
    all_pdf_download(args)
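For instance, to save everything from a lecture page into a chosen folder rather than the default Downloads directory, the script can be invoked with the -f option defined above, e.g. python all_pdf_dl.py -l <page-url> -f <target-folder>, where the angle-bracket values are placeholders.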
For more details and updates, you can refer to my gist: hibetterheyj/all_pdf_dl.py
Best!