用漂亮的汤报废ASP.NET网站

时间:2019-07-03 06:49:57

标签: asp.net web-scraping python-3.7

我正在尝试用请求废弃站点ASP.NET。第一步是登录,没关系,登录后我需要保留会话以从页面获取数据。

我通过request.cookies.RequestsCookieJar()获取cookie,在其他站点中此代码有效,但是在ASP.NET中开发的此站点中无效。我研究并找到了有关__VIEWSTATE和其他参数的信息,但我不知道。

#!/usr/bin/env python3
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import urllib3
urllib3.disable_warnings()

from time import sleep

class getData():
    def __init__(self, url):
        self.url = url
        self.session = requests.Session()

def getSelectorRecaptcha(self):
    page = requests.get(self.url, verify=False)
    soup = BeautifulSoup(page.text, 'html.parser')
    recaptcha = soup.find(class_='g-recaptcha')
    value = recaptcha.attrs
    return(value['data-sitekey'])

def solveRecaptcha(self, API_KEY):
    site_key = self.getSelectorRecaptcha()
    captcha_id = self.session.post("http://2captcha.com/in.php?key={}&method=userrecaptcha&googlekey={}&pageurl={}".format(API_KEY, site_key, self.url)).text.split('|')[1]

    recaptcha_answer = self.session.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(API_KEY, captcha_id)).text
    while 'CAPCHA_NOT_READY' in recaptcha_answer:
        sleep(5)
        recaptcha_answer = self.session.get("http://2captcha.com/res.php?key={}&action=get&id={}".format(API_KEY, captcha_id)).text

    if(recaptcha_answer == 'ERROR_WRONG_CAPTCHA_ID'):
        badreport = self.session.post("https://2captcha.com/res.php?key={}&action=reportbad&id={}".format(API_KEY, captcha_id)).text

    if (recaptcha_answer != 'ERROR_WRONG_CAPTCHA_ID'):
        recaptcha_answer = recaptcha_answer.split('|')[1]


        payload = {
            'key': site_key,
            'g-recaptcha-response': recaptcha_answer  
            }

        response = self.session.post(self.url, payload, verify=False)


def doLogin(self):
    login_data =  {'login':'xxxxxxxx', 'password':'xxxxxxx'}
    self.solveRecaptcha('xxxxxxxxxxxxxxxxxxxxxxxxxxxx')
    response = self.session.post(self.url, login_data, verify=False) #login

    soup = BeautifulSoup(response.content, 'html.parser')
    print(soup.select_one('__VIEWSTATE'))
    print(soup.select_one('__VIEWSTATEGENERATOR'))
    print(soup.select_one('__EVENTARGUMENT'))
    print(soup.select_one('__EVENTVALIDATION'))

    #get cookies
    jar = requests.cookies.RequestsCookieJar()
    c = self.session.cookies.get_dict()
    for i in self.session.cookies.get_dict():
        jar.set(i, c[i])

    self.session.cookies = jar

    page = self.session.get('private url', verify=False)
    soup = BeautifulSoup(page.text, 'html.parser')
    print(page.status_code)

    print(soup.find(id='some id'))






def main():

    obj = getData('private url')
    obj.doLogin()


if __name__ == "__main__":
    main()

https://www.dell.com/中也会发生此问题,我无法传递URL,但是正在Dell站点中进行测试。我希望在登录后获取数据。拜托,有人可以帮我吗?

0 个答案:

没有答案