requests_html: preventing a website redirect

Date: 2020-06-03 04:51:30

Tags: python web web-scraping screen-scraping

I am trying to scrape the following link with Python / requests_html: https://9anime.to/watch/one-piece-dub.34r/r2wjlq. My problem is that the page automatically redirects to the default server tab instead of the mp4upload tab. I have been trying to find a workaround for this but can't figure one out. The code is below:

import re
import requests
import cloudscraper
from urllib import parse
from bs4 import BeautifulSoup
from requests_html import HTMLSession

base_url = 'https://9anime.to'


class nine_scraper:

    @staticmethod
    def get_ep_links(url):
        html = nine_scraper.get_html(url, True)
        servers = html.find('div', id='servers-container')
        if servers:
            results = []
            mp4upload_results = []
            mp4upload = servers.find('div', attrs={'data-id': '35'})
            if not mp4upload:  # guard: the mp4upload tab may be missing
                print('mp4upload tab not found!')
                return results
            mp4upload_eps = mp4upload.find_all('a', href=True)
            for ep in mp4upload_eps:
                x = (ep.get('href'), ep.text)
                mp4upload_results.append(x)
            for result in mp4upload_results:
                results.append(base_url + result[0])
            return results
        else:
            print('No servers found!!')
            return []  # return an empty list so callers can still iterate

    @staticmethod
    def get_series_info(url):
        return

    @staticmethod
    def get_servers(html):
        return

    @staticmethod
    def find_download(url):
        html = nine_scraper.get_html(url, True)

    @staticmethod
    def search(query):
        if '&page=' in query:
            query = query.split('&page=')
            search_url = base_url + '/search?keyword=' + parse.quote(query[0]) + '&page=' + query[1]
        else:
            search_url = base_url + '/search?keyword=' + parse.quote(query)
        html = nine_scraper.get_html(search_url, False)
        film_list = html.find('div', class_='film-list')
        if film_list:
            results = []
            prev_page = html.find('a', class_='pull-left')
            next_page = html.find('a', class_='pull-right')
            films = film_list.find_all('div', class_='inner')
            for film in films:
                results.append((film.find('a', class_='name').text.strip(), film.find('a', class_='name').get('href').strip()))
            if prev_page and prev_page.get('href'):  # guard: link may be absent on the first page
                param = parse.urlsplit(base_url + '/' + prev_page.get('href')).query
                url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
                results.append(('Previous page', url))
            if next_page and next_page.get('href'):  # guard: link may be absent on the last page
                param = parse.urlsplit(base_url + '/' + next_page.get('href')).query
                url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
                results.append(('Next page', url))
            return results
        else:
            print('No results found!')
            return []  # keep the return type consistent for callers

    @staticmethod
    def get_html(url, render_js=False):  # Load a webpage and return its parsed HTML
        try:
            if render_js:  # Page needs JavaScript rendered, so use requests_html
                session = HTMLSession()
                resp = session.get(url, timeout=10)  # Make the GET request
                resp.raise_for_status()  # Raise an exception on 4xx/5xx responses
                resp.html.render(timeout=10)  # Render the JavaScript in headless Chromium
                html = BeautifulSoup(resp.html.html, 'html.parser')  # Parse the rendered HTML with BeautifulSoup
                return html
            else:  # No JavaScript needed, so use cloudscraper (it handles Cloudflare)
                c_scraper = cloudscraper.create_scraper()
                resp = c_scraper.get(url)  # Make the GET request
                resp.raise_for_status()  # Raise an exception on 4xx/5xx responses
                html = BeautifulSoup(resp.content, 'html.parser')  # Parse the HTML with BeautifulSoup
                return html
        except requests.HTTPError as e:
            print(f'HTTP error occurred: {e}')
        except requests.ConnectionError as e:
            print(f'Connection error occurred: {e}')
        except requests.Timeout as e:
            print(f'Timeout error occurred: {e}')
        except requests.RequestException as e:
            print(f'General requests error occurred: {e}')
        except KeyboardInterrupt:
            print('Someone closed the program')
        except Exception as e:
            print(f'Other error occurred: {e}')

# --- Second file: the CLI menu; it imports the class above from scrapers.py ---
import sys
from os import system, name
from scrapers import nine_scraper


def screen_clear():
    # os.name is 'nt' on Windows; 'posix' on mac and linux
    if name == 'nt':
        _ = system('cls')
    else:
        _ = system('clear')


def main_menu():
    while True:
        screen_clear()
        print('------9anime downloader------\n[1] Search \n[2] Download \n[3] Exit\n-----------------------------\n')
        main_choice = input('Enter your choice [1-3] >')
        if main_choice == '1':
            search_menu()
            break
        elif main_choice == '2':
            continue  # download option not implemented yet
        elif main_choice == '3':
            screen_clear()
            sys.exit()
        else:
            continue


def search_menu(query=False):
    screen_clear()
    print('--------------9anime downloader/search--------------\n')
    if query:
        search_results = nine_scraper.search(query)
        results_menu(search_results)
    else:
        query = input('Please enter the name of the anime >')
        if query:
            search_results = nine_scraper.search(query)
            results_menu(search_results)


def results_menu(results):
    p = n = False  # track whether previous/next page entries exist
    for num, result in enumerate(results, 1):
        title = result[0]
        link = result[1]
        if 'Previous page' not in title:
            if 'Next page' in title:
                n = True
                print('[N] ' + title)
            else:
                print(f'[{num}] {title}')
        else:
            p = True
            print('[P] ' + title)
    print('[M] Main menu')
    titles, links = map(list, zip(*results))
    while True:
        search_choice = input('Enter choice >')
        try:
            search_choice = int(search_choice)
            if 1 <= search_choice <= len(results):
                print(links[search_choice - 1])
                print(titles[search_choice - 1])
                ep_links = nine_scraper.get_ep_links(links[search_choice - 1])
                for link in ep_links:
                    print(link)
                    nine_scraper.find_download(link)
                # series_menu(links[search_choice - 1])
                break
        except ValueError:
            if search_choice.lower() == 'm':
                main_menu()
                break
            elif search_choice.lower() == 'p':
                if p:
                    # 'Previous page' sits before 'Next page' when both exist
                    url = links[-2] if n else links[-1]
                    search_menu(url)
                    break
                continue
            elif search_choice.lower() == 'n':
                if n:
                    url = links[-1]  # 'Next page' is always appended last
                    search_menu(url)
                    break
                continue


def series_menu(url):
    info = nine_scraper.get_series_info(url)  # pass the url through to the scraper


if __name__ == '__main__':
    main_menu()

I know it must be some JavaScript that redirects the page, but I can't figure out what to do to stop it. Any help would be greatly appreciated!
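
One avenue worth trying (a minimal, unverified sketch): HTTP-level redirects can be suppressed by passing allow_redirects=False to the GET request, and since the tab switch here appears to be client-side JavaScript, requests_html's render() accepts a script argument (a JS function evaluated in the headless Chromium page) that could click the mp4upload tab before the HTML is read back. The [data-id="35"] selector below is only a guess, extrapolated from the data-id '35' used in get_ep_links; the actual element on 9anime may differ.

from requests_html import HTMLSession

# Hypothetical JS: click whatever element marks the mp4upload server
# (data-id '35', per get_ep_links above) and report the page's URL.
# The selector is an assumption -- inspect the live page to confirm it.
CLICK_MP4UPLOAD = """
() => {
    const tab = document.querySelector('[data-id="35"]');
    if (tab) { tab.click(); }
    return document.location.href;
}
"""

session = HTMLSession()
resp = session.get('https://9anime.to/watch/one-piece-dub.34r/r2wjlq',
                   timeout=10, allow_redirects=False)  # stop any HTTP 3xx hops
resp.raise_for_status()
# sleep gives the page time to swap the episode list in after the click
final_url = resp.html.render(script=CLICK_MP4UPLOAD, sleep=2, timeout=10)
print('Page ended up at:', final_url)
html = resp.html.html  # post-click HTML, ready for BeautifulSoup

If a single click isn't enough (for example, if the tab loads its episode list asynchronously), render(..., keep_page=True) keeps the Chromium page alive so further scripts can be evaluated against it.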

0 Answers:

No answers yet.