I'm trying to get the links to PDF files with a scraping script. (My end goal is to download the PDFs, but I'm still learning, and getting the links seemed like a good intermediate step in the workflow.) Access to these files is restricted, but I can reach them through a VPN. The option to download the PDF is available in my browser, yet when the script walks the page it returns no links to the PDFs. Other data, such as document titles, I can extract fine; they print correctly. So I assume I have the tag traversal working. Is this a problem specific to requesting the PDF links, or to accessing the site through a VPN? I'll share the script and the HTML from the site, since without VPN access you won't be able to see the latter.
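One way to narrow this down before blaming the selectors: check whether the download markup is present at all in the raw HTML that requests receives. If the title markup is there but the download module is not, the PDF links are presumably injected by JavaScript or only served to a browser session, and no BeautifulSoup selector will ever find them. A minimal diagnostic sketch, assuming the same browse URL as the script below:

import requests

# Fetch one browse page and look for the class names the scraper relies on
# in the raw (non-JavaScript-rendered) HTML.
url = ('http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986'
       '&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000'
       '&type_0=booksandjournals&redirect=true')
html = requests.get(url).text
for marker in ('itemTitle', 'downloadLinksModule', 'class="pdf"'):
    # If 'itemTitle' is found but 'downloadLinksModule' is missing, the PDF
    # links never reach the script and the selectors are not the problem.
    print(marker, '->', 'found' if marker in html else 'missing')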
import requests
from bs4 import BeautifulSoup

#Crawl defined 'browse' page and dig down to the PDF download links
def elibrary_spider(max_pages):
    page = 1
    while page <= max_pages:
        if page < 2:
            url = \
                'http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000&type_0=booksandjournals&redirect=true'
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            #Walk result container -> result item -> download module -> ul.types -> li.pdf -> a
            for first in soup.find_all('div', class_='s-pt-2'):
                for second in first('div', class_='contentItem nlm-book hasAccess hasCover standardResult chunkResult hi-visible s-px-4 s-py-3 s-bdr-b l-flex l-flex-row'):
                    for titlelist in second('div', class_='downloadLinksModule closed'):
                        for ullink in titlelist('ul', class_='types'):
                            for link in ullink('li', class_='pdf'):
                                for pdf in link('a'):
                                    href = 'http://www.elibrary.imf.org/' + pdf.get('href')
                                    print(href)
            page += 1
            print(page)
        #Subsequent pages have a different URL
        if page > 1:
            url = \
                'http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986&page=' + str(page) \
                + '&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000&type_0=booksandjournals'
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for first in soup.find_all('div', class_='s-pt-2'):
                for second in first('div', class_='contentItem nlm-book hasAccess hasCover standardResult chunkResult hi-visible s-px-4 s-py-3 s-bdr-b l-flex l-flex-row'):
                    for titlelist in second('div', class_='downloadLinksModule closed'):
                        for ullink in titlelist('ul', class_='types'):
                            for link in ullink('li', class_='pdf'):
                                for pdf in link('a'):
                                    href = 'http://www.elibrary.imf.org/' + pdf.get('href')
                                    print(href)
            page += 1
            print(page)

elibrary_spider(3)
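A side note on BeautifulSoup that may matter here (a general pitfall, not something confirmed against this page): passing a multi-word string to class_ matches the class attribute as one exact, order-sensitive string, so any variation in the long contentItem ... class list makes the match fail silently. A CSS selector matches each class independently and is less brittle; a minimal equivalent of the inner extraction, assuming the soup object from the script above:

# Each dotted segment matches one class, so extra classes on the element
# and their order no longer matter.
for pdf in soup.select('div.s-pt-2 div.contentItem.hasAccess '
                       'div.downloadLinksModule ul.types li.pdf a'):
    print('http://www.elibrary.imf.org/' + pdf.get('href'))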
The HTML from the site is:
The following script actually works and moves on to the next page, so the entry point soup.find_all('div', class_='s-pt-2') should be correct, right? (No VPN is needed for this one.)
import requests
from bs4 import BeautifulSoup

#Crawl defined 'browse' page
def elibrary_spider(max_pages):
    page = 1
    while page <= max_pages:
        if page < 2:
            url = \
                'http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000&type_0=booksandjournals&redirect=true'
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for doclist in soup.find_all('div', class_='s-pt-2'):
                for titlelist in doclist('h2', class_='itemTitle'):
                    for link in titlelist('a'):
                        href = 'http://www.elibrary.imf.org/' + link.get('href')
                        print(href)
                        get_publication(href)  #defined elsewhere in the full script
            page += 1
            print(page)
        #Subsequent pages have a different URL
        if page > 1:
            url = \
                'http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986&page=' + str(page) \
                + '&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000&type_0=booksandjournals'
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for doclist in soup.find_all('div', class_='s-pt-2'):
                for titlelist in doclist('h2', class_='itemTitle'):
                    for link in titlelist('a'):
                        href = 'http://www.elibrary.imf.org/' + link.get('href')
                        print(href)
                        get_publication(href)
            page += 1
            print(page)

elibrary_spider(3)
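Finally, independent of the VPN question, the two near-identical branches can be collapsed: the only differences between them are the &page= parameter and the &redirect=true flag on page 1. A condensed sketch under that assumption, using a requests.Session with a browser-like User-Agent in case the server serves different HTML to scripts than to browsers (an assumption on my part, not something confirmed for this site):

import requests
from bs4 import BeautifulSoup

BASE = ('http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986'
        '&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000'
        '&type_0=booksandjournals')

def elibrary_spider(max_pages):
    # One Session reuses cookies and headers across all page requests.
    session = requests.Session()
    session.headers['User-Agent'] = 'Mozilla/5.0'  # hypothetical value; adjust as needed
    for page in range(1, max_pages + 1):
        url = BASE + ('&redirect=true' if page == 1 else '&page=' + str(page))
        soup = BeautifulSoup(session.get(url).text, 'html.parser')
        for link in soup.select('div.s-pt-2 h2.itemTitle a'):
            print('http://www.elibrary.imf.org/' + link.get('href'))
        print(page)

elibrary_spider(3)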