试图通过asp.net登录并抓取一个网站

时间:2016-07-28 13:45:24

标签: python asp.net web-scraping beautifulsoup python-requests

我写了一个程序,目的是登录我公司的一个网站,然后抓取数据,以便更快地收集数据。这是使用请求和美丽的汤。

我可以打印出页面的html代码,但我无法通过aspx登录,然后在页面上打印html。

下面是使用的代码和我的标题和参数。任何帮助将不胜感激

if #available(iOS 8.0, *)
{

    if application.respondsToSelector("isRegisteredForRemoteNotifications")
    {

        let types:UIUserNotificationType = ([.Alert, .Sound, .Badge])

        let settings:UIUserNotificationSettings = UIUserNotificationSettings(forTypes: types, categories: nil)

        application.registerUserNotificationSettings(settings)
        application.registerForRemoteNotifications()
    }

}
else{
        let types: UIRemoteNotificationType = [.Alert, .Badge, .Sound]
        application.registerForRemoteNotificationTypes(types)
}

FROM DATA

import requests
from bs4 import BeautifulSoup

URL="http://mycompanywebsiteloginpage.co.uk/Login.aspx"
headers={"User-Agent":"Mozilla/5.0 (X11; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0 Iceweasel/44.0.2"}

username="myusername"
password="mypassword"

s=requests.Session()
s.headers.update(headers)
r=s.get(URL)
soup=BeautifulSoup(r.content)

VIEWSTATE=soup.find(id="__VIEWSTATE")['value']
EVENTVALIDATION=soup.find(id="__EVENTVALIDATION")['value']
EVENTTARGET=soup.find(id="__EVENTTARGET")['value']
EVENTARGUEMENT=soup.find(id="__EVENTARGUMENT")['value']

login_data={"__VIEWSTATE":VIEWSTATE,
"ctl00$ContentPlaceHolder1$_tbEngineerUsername":username,
"ctl00$ContentPlaceHolder1$_tbEngineerPassword":password,
"ctl00$ContentPlaceHolder1$_tbSiteOwnerEmail":"",
"ctl00$ContentPlaceHolder1$_tbSiteOwnerPassword":"",
"ctl00$ContentPlaceHolder1$tbAdminName":username,
"ctl00$ContentPlaceHolder1$tbAdminPassword":password,
"__EVENTVALIDATION":EVENTVALIDATION,
"__EVENTTARGET":EVENTTARGET,
"--EVENTARGUEMENT":EVENTARGUEMENT}

r = s.post(URL, data=login_data)
r = requests.get("http://mycompanywebsitespageafterthelogin.co.uk/Secure/")
print (r.url)
print (r.text)

请求COOKIES

__VIEWSTATE:"DAwNEAIAAA4BBQAOAQ0QAgAADgEFAw4BDRACDwEBBm9ubG9hZAFkU2hvd1BhbmVsKCdjdGwwMF9Db250ZW50UGxhY2VIb2xkZXIxX19wbkFkbWluaXN0cmF0b3JzJywgZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoJ2FkbWluTG9naW5MaW5rJykpOwAOAQUBDgENEAIAAA4DBQEFBwULDgMNEAIMDwEBDUFsdGVybmF0ZVRleHQBDldEU0kgRGFzaGJvYXJkAAAAAA0QAgAADgIFAAUBDgINEAIPAQEEVGV4dAEEV0RTSQAAAA0QAgwPAQEHVmlzaWJsZQgAAAAADRACDwECBAABBFdEU2kAAAAAAABCX8QugS7ztoUJMfDmZ0s20ZNQfQ=="
ctl00$ContentPlaceHolder1$_tbEngineerUsername:"myusername"
ctl00$ContentPlaceHolder1$_tbEngineerPassword:"mypassword"
ctl00$ContentPlaceHolder1$_tbSiteOwnerEmail:""
ctl00$ContentPlaceHolder1$_tbSiteOwnerPassword:""
ctl00$ContentPlaceHolder1$tbAdminName:"myusername"
ctl00$ContentPlaceHolder1$tbAdminPassword:"mypassword"
__EVENTVALIDATION:"HQABAAAA/////wEAAAAAAAAADwEAAAAKAAAACBzHEFXh+HCtf3vdl8crWr6QZnmaeK7pMzThEoU2hwqJxnlkQDX2XLkLAOuKEnW/qBMtNK2cdpQgNxoGtq65"
__EVENTTARGET:"ctl00$ContentPlaceHolder1$_btAdminLogin"
__EVENTARGUMENT:""

响应标题

ASP.NET_SessionId:"11513CDDE31AF267CCD87BAB"

请求标题

Cache-Control:"private"
Connection:"Keep-Alive"
Content-Length:"123"
Content-Type:"text/html; charset=utf-8"
Date:"Thu, 28 Jul 2016 13:37:45 GMT"
Keep-Alive:"timeout=15, max=91"
Location:"/Secure/"
Server:"Apache/2.2.14 (Ubuntu)"
x-aspnet-version:"2.0.50727"

1 个答案:

答案 0 :(得分:3)

更改行

r = requests.get("http://mycompanywebsitespageafterthelogin.co.uk/Secure/")

使用会话对象

r = s.get("http://mycompanywebsitespageafterthelogin.co.uk/Secure/")