How do I get the link after a redirect when using BeautifulSoup?

Asked: 2019-01-16 07:34:26

Tags: python beautifulsoup

I want to get the download link on an article page after it redirects.

For example: https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/

The article page above contains the following download link: https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=

If you open this link directly, it does not redirect to the actual download link; you have to open it from within the article page.
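For instance (a minimal sketch, assuming only requests is installed), requesting that download link directly just stays on scanlibs.com instead of redirecting to the real file host:

import requests

# the download link taken from the article page above
link = ('https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
        'yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=')

r = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
print(r.url)  # still the scanlibs.com URL, not the actual download link

Here is my script so far: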

# coding=utf-8

import sys

import requests
from bs4 import BeautifulSoup


def urlopen(url):
    '''
    Fetch a page with requests (replacing urllib.request.urlopen)
    and return its HTML as text.
    '''
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers)
    return r.text

def generate_pages(subTitle, fromPage, toPage):
    '''
    Return the list of category page URLs to scrape.
    '''
    pages = []
    base = 'https://scanlibs.com/category/books'
    if subTitle:
        base += '/' + subTitle  # e.g. .../category/books/programming
    if 0 < fromPage <= toPage:
        for i in range(fromPage, toPage + 1):
            pages.append(base + '/page/' + str(i))
    return pages



def get_book_sites_of_one_page(page):
    '''
    Collect the book page URLs listed on one category page.
    input:  category page URL
    output: list of book page URLs
    '''
    html = urlopen(page)
    soup = BeautifulSoup(html, 'html.parser')
    linkList = soup.find('main').findAll('a', {'rel': 'bookmark'})
    bookSites = []
    # each book appears to be linked twice, so take every other match
    for link in linkList[::2]:
        if 'href' in link.attrs:
            bookSites.append(link.attrs['href'])
    return bookSites


def get_book_urls(bookSite):
    '''
    Find the download URLs on one book page
    and return them as a list.
    '''
    bookURLs = []
    html = urlopen(bookSite)
    soup = BeautifulSoup(html, 'lxml')
    linkList = soup.findAll("a", {"target": "_blank"})
    # matching links also come in pairs here, so take every other one
    for link in linkList[::2]:
        if 'href' in link.attrs:
            bookURLs.append(link.attrs['href'])
    return bookURLs


def get_all_book_urls(fromPage=1, toPage=1, subTitle=''):
    bookSites = []
    bookURLs = []
    pages = generate_pages(subTitle, fromPage, toPage)

    for page in pages:
        bookSites.extend(get_book_sites_of_one_page(page))

    for bookSite in bookSites:
        bookURLs += get_book_urls(bookSite)

    for bookURL in bookURLs:
        print(bookURL)

    #with open(filename, 'w') as f:
    #    f.write(bookURLs)


def main():
    if len(sys.argv) == 4:
        '''
        python getUrl.py 1 100 programming
        from page 1 to page 100 in the subject "programming"
        '''
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        subTitle = str(sys.argv[3])
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 3:
        '''
        python getUrl.py 1 100
        from page 1 to page 100
        '''
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        subTitle = ''
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 2:
        '''
        python getUrl.py 10
        only scrape the books on page 10
        '''
        fromPage = int(sys.argv[1])
        toPage = fromPage  # was fromPage + 1, which also scraped the next page
        subTitle = ''
        get_all_book_urls(fromPage, toPage, subTitle)

    elif len(sys.argv) == 1:
        # default page range
        fromPage = 1
        toPage = 2
        subTitle = ''
        get_all_book_urls(fromPage, toPage, subTitle)

    else:
        print("Error, too many arguments")



if __name__ == '__main__':
    main()

Thanks for your help!

1 Answer:

Answer 0 (score: 1)

This site checks whether the Referer header is set when it serves the redirect. You can bypass this easily by supplying the original article URL as the referer in the request headers. You can also see the referer being used as a URL parameter in the final download link.

import requests
from bs4 import BeautifulSoup

s = requests.Session()
url = 'https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
html = s.get(url).text
soup = BeautifulSoup(html, 'html.parser')
relative_link = soup.find('a', {'id': 'download'})['href']  # get the relative link
download_redirect_link = url + relative_link
headers = {
    "referer": url
}
r2 = s.get(download_redirect_link, headers=headers)
print(r2.url)
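
To confirm that the redirect actually fired, you can also inspect r2.history, which requests populates with the intermediate responses (a quick extra check, not part of the original answer):

# each entry in r2.history is an intermediate redirect response
for resp in r2.history:
    print(resp.status_code, resp.url)
print('final:', r2.status_code, r2.url)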

Output

https://rapidgator.net/file/80e881f7631eddb49de31e5718eb96ba?referer=https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/