Web scraping with Python: can't reach certain tags

Date: 2018-03-13 14:42:23

Tags: python html web-scraping beautifulsoup vpn

I am trying to get at the links to PDF files with a scraping script. (My end goal is to download the PDFs, but I am still learning, and getting the links seems like a good intermediate step in the workflow.) Access to these files is restricted, but I can reach them through a VPN. The option to download the PDF is available in my browser, yet when the script walks the page it returns no links to PDFs. Other data, such as document titles, I can get just fine, and they display correctly, so I assume I have the tags working the right way. Is this a problem specific to requesting links to PDFs, or to accessing the site through a VPN? I am sharing both the script and the HTML from the site, since without VPN access you won't be able to see the latter.
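One way to narrow this down is to check whether the download markup is present at all in the HTML that requests receives: if the site only renders the download links for an authenticated browser session, or injects them with JavaScript, they simply won't be in the response no matter how the tags are parsed. A minimal sketch, assuming the URL and class name from the script below (the User-Agent header is an assumption, not something the site is known to require):

import requests

# Fetch the browse page exactly as the spider does and check whether the
# download-link markup appears anywhere in the raw HTML.
url = ('http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986'
       '&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000'
       '&type_0=booksandjournals&redirect=true')

# A browser-like User-Agent is an assumption; some sites serve reduced
# markup to unknown clients.
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})

# If this prints False, the PDF links never reach the script, regardless
# of which tags the parser looks for afterwards.
print('downloadLinksModule' in response.text)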

import requests
from bs4 import BeautifulSoup


def elibrary_spider(max_pages):
    page = 1
    while page <= max_pages:
        if page < 2:
            url = (
                'http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986'
                '&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000'
                '&type_0=booksandjournals&redirect=true'
            )
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for first in soup.find_all('div', class_='s-pt-2'):
                for second in first('div', class_='contentItem nlm-book hasAccess hasCover standardResult chunkResult hi-visible s-px-4 s-py-3 s-bdr-b l-flex l-flex-row'):
                    for titlelist in second('div', class_='downloadLinksModule closed'):
                        for ullink in titlelist('ul', class_='types'):
                            for link in ullink('li', class_='pdf'):
                                for pdf in link('a'):
                                    href = 'http://www.elibrary.imf.org/' + pdf.get('href')
                                    print(href)
            page += 1
            print(page)
        # Subsequent pages have a different url
        if page > 1:
            url = (
                'http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986&page=' + str(page)
                + '&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000&type_0=booksandjournals'
            )
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for first in soup.find_all('div', class_='s-pt-2'):
                for second in first('div', class_='contentItem nlm-book hasAccess hasCover standardResult chunkResult hi-visible s-px-4 s-py-3 s-bdr-b l-flex l-flex-row'):
                    for titlelist in second('div', class_='downloadLinksModule closed'):
                        for ullink in titlelist('ul', class_='types'):
                            for link in ullink('li', class_='pdf'):
                                for pdf in link('a'):
                                    href = 'http://www.elibrary.imf.org/' + pdf.get('href')
                                    print(href)
            page += 1
            print(page)


elibrary_spider(3)
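One thing worth knowing about BeautifulSoup here: passing a string with several classes to class_ only matches elements whose class attribute is exactly that string, in that order, so the long contentItem ... l-flex-row filter is fragile. If the markup really is in the response, a CSS selector that just requires a couple of the classes is usually more robust. A short sketch under that assumption, reusing the class names from the script above:

import requests
from bs4 import BeautifulSoup

url = ('http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986'
       '&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000'
       '&type_0=booksandjournals&redirect=true')
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# select() matches elements that carry these classes in any order and with
# any extra classes, instead of requiring the exact multi-class string.
for pdf in soup.select('div.contentItem li.pdf a'):
    print('http://www.elibrary.imf.org/' + pdf.get('href'))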

- EDIT -

The HTML of the site is:

HTML Tree

This one actually works and moves on to the next page, so the entry soup.find_all('div', class_='s-pt-2') should be correct, right? (No VPN is needed for this one.)

import requests
from bs4 import BeautifulSoup


# Crawl the defined 'browse' pages
def elibrary_spider(max_pages):
    page = 1
    while page <= max_pages:
        if page < 2:
            url = (
                'http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986'
                '&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000'
                '&type_0=booksandjournals&redirect=true'
            )
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for doclist in soup.find_all('div', class_='s-pt-2'):
                for titlelist in doclist('h2', class_='itemTitle'):
                    for link in titlelist('a'):
                        href = 'http://www.elibrary.imf.org/' + link.get('href')
                        print(href)
                        get_publication(href)
            page += 1
            print(page)
        # Subsequent pages have a different url
        if page > 1:
            url = (
                'http://www.elibrary.imf.org/browse?freeFilter=false&fromDate=1986&page=' + str(page)
                + '&pageSize=100&sort=date&t_8=urn%3Aeng&toDate=2000&type_0=booksandjournals'
            )
            source_code = requests.get(url)
            plain_text = source_code.text
            soup = BeautifulSoup(plain_text, "html.parser")
            for doclist in soup.find_all('div', class_='s-pt-2'):
                for titlelist in doclist('h2', class_='itemTitle'):
                    for link in titlelist('a'):
                        href = 'http://www.elibrary.imf.org/' + link.get('href')
                        print(href)
                        get_publication(href)
            page += 1
            print(page)


elibrary_spider(3)
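Since the end goal is downloading the PDFs, here is a minimal sketch of what get_publication (called above but not shown in the question) might do once the link problem is solved: open the publication page with a requests.Session so cookies are reused across requests, look for the same li.pdf anchor, and stream the file to disk. The selector and the filename handling are assumptions, not the author's actual implementation.

import os
import requests
from bs4 import BeautifulSoup

session = requests.Session()  # reuse cookies across requests


def get_publication(href):
    # Hedged sketch of a possible get_publication: fetch the publication
    # page and follow the PDF link, if the markup actually contains one.
    soup = BeautifulSoup(session.get(href).text, 'html.parser')
    pdf_link = soup.select_one('li.pdf a')  # selector is an assumption
    if pdf_link is None:
        print('No PDF link found on', href)
        return
    pdf_url = 'http://www.elibrary.imf.org/' + pdf_link.get('href')
    filename = os.path.basename(pdf_link.get('href')) or 'document.pdf'
    # Stream the download so large PDFs are not held in memory at once.
    with session.get(pdf_url, stream=True) as response:
        with open(filename, 'wb') as handle:
            for chunk in response.iter_content(chunk_size=8192):
                handle.write(chunk)
    print('Saved', filename)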

0 Answers:

No answers yet.