您好,我想下载以下网页中所链接的 PDF 文件:
https://arxiv.org/find/all/1/all:+5g/0/1/0/all/0/1?skip=0&query_id=32bdbf71e4007c69
是否有可用的Python3代码?任何帮助将不胜感激。
答案 0 :(得分:1)
以下代码对我有用:
import os
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve
# Listing page whose "/pdf/..." links are downloaded.
URL = 'https://arxiv.org/find/all/1/all:+5g/0/1/0/all/0/1?skip=0&query_id=32bdbf71e4007c69'
OUTPUT_DIR = ''  # path to output folder; '.' or '' uses the current folder
# Scheme and host prepended to the site-relative hrefs found on the page.
ARXIV_BASE = 'https://arxiv.org'


def build_pdf_url(href):
    """Return the absolute download URL for a site-relative ``/pdf/...`` href.

    The ``.pdf`` suffix is added only when it is missing, so an href that
    already ends in ``.pdf`` is not turned into ``....pdf.pdf``.
    """
    url = ARXIV_BASE + href
    if not url.endswith('.pdf'):
        url += '.pdf'
    return url


def local_filename(pdf_url, output_dir=None):
    """Return the local path for *pdf_url*: its last path segment in *output_dir*.

    When *output_dir* is ``None`` the module-level ``OUTPUT_DIR`` is used.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    return os.path.join(output_dir, pdf_url.rsplit('/', 1)[-1])


def fetch_html(url):
    """Fetch *url* and return the response body decoded as UTF-8."""
    # The context manager closes the connection even if read()/decode() raises.
    with urlopen(url) as response:
        return response.read().decode('utf-8')


def main():
    """Download every PDF linked from ``URL`` into ``OUTPUT_DIR``."""
    soup = BeautifulSoup(fetch_html(URL), "html.parser")

    # urlretrieve does not create missing directories; '' means the current
    # folder, and os.makedirs('') would raise, so only create a named folder.
    if OUTPUT_DIR:
        os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Only anchors whose href starts with "/pdf" are treated as paper links.
    for link in soup.select('a[href^="/pdf"]'):
        pdf_url = build_pdf_url(link.get('href'))
        filename = local_filename(pdf_url)
        print(f"Downloading {pdf_url} to {filename}...")
        urlretrieve(pdf_url, filename)
        print("Done.")


if __name__ == "__main__":
    main()