Unable to scrape an ASPX web page with Python?

Date: 2018-04-03 13:14:33

Tags: python http web-scraping

Description: I am trying to scrape the URL below. After sending a POST request with the parameters set in the code, I am trying to fetch the resulting data.

Problem: What I actually get back is the HTML of the original .aspx page; the parameters I set in 'formFields' are never applied. Can anyone explain my mistake?

import urllib
import urllib2

uri = 'http://cbseaff.nic.in/cbse_aff/schdir_Report/userview.aspx'

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Content-Type': 'text/html; charset=utf-8'
}

formFields = (
        (r'__VIEWSTATE',r'yE41BXTcCz0TfHmpBjw1lXLQe3PFLLTPmxSVU0hSeOt6wKGfqvehmJiLNmLj2VdBWrbXtmUIr0bh8lgw8X8f/yn9T1D4zmny5nUAc5HpgouURKeWzvZasUpmJUJ8pgh4jTUo62EVRQCYkXKayQVbCwaSP81BxDO9gxrERvzCUlw8i76A4jzlleSSunjr844drlsOw/PxjgYeRZCLm/h8WAc5HZrJ+w7vLMyLxlY/mDQaYdkVAF/s4lAJAxGfnX1rlshirdphBhI1tZIuoJa+ZTNzizgrXi70PVnAR3cw0QhCWr2rrTkrvoJ+rI5pme0pYPAX+CZfmSH3Cg1BKEbY/+G+p1AsLRqsobC8EBQXHPicqnhgOR7/kx+Z54XyCzxDwXCZBFKl3npcSq4xJ2Ebi3PFS6FtT0K+wZTZ8XGCQUVudzKyqhfDxQ4UTpDWn4vR7LIF765qhXpRmNG6HCzbvgrLqNrBzt+PZ0mbpeLsIWEia5L/AIbN+zcLiNsBxTBA9zOsljZDPkbL1rWo+WDUwBfDRiu6X4ru+RKypczFFzoUUciCaWD2ciOq+//NYa7NEwZ9d7YRY/LfEhszYUJO72Xpzaotxu7+7RdGVvsxzrh1Ro8OHqoskesX6QlEdqjakgrk3yqaroXCfnnBJu1ExKYXrU6JuVFbOGz66CZfkqiJJSVHs2Ozfos/yPWWiJVyETKtPQF003o9GGKKIFIaQ6YRHoco3jVOKB+NIE/VxMJHiSzPuuExZ27MY/pX9U9rx5/4az6i/LIqYoRzQilJT7lN5eTIdVn/W5Uep0tvKtOKhR7JjX7WhO7QpROlOh7H/wbCdr5u/jXB5FpJLB1Y8oZSdVyKJK8Xx+tODFvxYxMve5NT+/fBwlMSfkh7+h7+5Zo5hHKwLF01mrS52KHehix4Ja5bQ3hy6Q2r6pz+ZVHvUsev9OpsZlO1+PPjVCbdjzeUX23At6R4BRm6Zq/o0knl2Xf/kvM6WtKtDi+AbpIa7Eyj+brqWMdDKW4AiOLGS45H4ERP2z1CeVwiXlfa22JhkPbz8GJf9J9iey+JefnmsgD5ScdcvPU1mb1i7NLv9QOYXZXZwJXhGHZcw65iQm7vrZB5sJlBp7agBhhwX2KNaKZguNGVGhlxiS84zoKrkvdBv7e52n6H9p3okMvqHR+yEe+UCuDPanO+9tTvNvOqBzEAVvPYIK80YWsuDY3R66OBPjQEKpbPrDpz5hoMKk59r1FiIq6Jd71c6QeE57Au3ei72GZEVzaLXAva0RJP/tSROnO7ZKHkvxuP0oayVlzjLuIHnO0o4zUsoHpTJqPa20Bxv9De3JwOOi8HJgYj+dZjdRIDT9zHhgbLV9UO4z0HHyK54RIS67OAS8WqMYyfdC5I5GGwy8rosVKNjCfHymMEUzbs5iHCPhrM/X0UMJTxQ7yq113/6U43p6BP4PqP/OAgRYxejrVtT9goPKWxHTwu0kDROXCVvqHo5SiQ+/X3DdTxLF+13p0k5xlXBk0qkeLJkNlSYBeTOgPyvjHxnSMUdjhjHtiA0oFCSSCYCpVU9Pe3PLQyyUjv+KhI/jWS94D3KxYqXjyHUC/nMxEwot65kzFE/scAoOsdk/MJS/uZw4PbfbGEVKWTcJLtOV8s3wHKPzmB/AexZ//iEmDv'),
        (r'__VIEWSTATEGENERATOR','AC872EDD'),
        (r'__EVENTVALIDATION',r'35e8G73FpRBmb1p/I2BD+iKRsX5vU8sS0eAo0dx/dcp4JGg0Jp+2ZLyg8GHhCmrVcGjQVBABi8oOAgXt+2OghWX0FyVvk0fo8XPTIL/uMJjSBFrRvgsQNnh/Ue1VCOGhBgwhmrBt+VbjmmkA3hafVR2lAVjy/RYswgFcmyloWPyxQDu9UuktSiCNfyfJcF/xZrJDTeHBkNmB9P8QGNHLVnoaZjsmks/yB+Nll5ZLiq0WvDRhsbq1I/rrRWytnleHAMNutsEqdU9OvycQ/insfM871Jww+TyIvmxEVRYcAH6MnYl0nqUaderHN4R37aH8CH8B/NUxGDYLSdlsvJGJxXEZh9EVzzDprJuz7sJUxolBhjv6YNfJDAThrLCip2QEY20SztPZ/j8KnWgnX7Xs6sxjZofKnQxNlB44dZG0okIPitb9zjWB6kC2xDmN69vfDayDOCkcPJG3q/HMP6ewQvV/YheqUbuBwC77WPIKXrc='),
        (r'ctl00$ContentPlaceHolder1$optlist', 'State+Wise'),
        (r'ctl00$ContentPlaceHolder1$ddlitem' , '22'),
        (r'ctl00$ContentPlaceHolder1$optlistChoice', 'List+of+All+Schools'),
        (r'ctl00$ContentPlaceHolder1$search', 'Search'),
        (r'__EVENTTARGET','ctl00$ContentPlaceHolder1$search')
)

encodedFields = urllib.urlencode(formFields)

req = urllib2.Request(uri, encodedFields, headers)
f = urllib2.urlopen(req)

# Keep the writes inside the try block so 'fout' is never
# referenced when open() fails.
try:
    fout = open('temp.htm', 'w')
    fout.writelines(f.readlines())
    fout.close()
except IOError:
    print('Could not open output file\n')
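Two details in this snippet are worth flagging as likely culprits, as a hedged guess rather than a confirmed diagnosis: the POST declares 'Content-Type: text/html', but ASP.NET only binds form fields that arrive as 'application/x-www-form-urlencoded'; and values such as 'State+Wise' are already URL-encoded, so urllib.urlencode() escapes the '+' a second time (to '%2B'). A minimal sketch of the corrected pieces:

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # A form POST must declare the urlencoded content type, or the
    # server-side framework will not parse the submitted fields.
    'Content-Type': 'application/x-www-form-urlencoded'
}

# Pass the values unencoded; urllib.urlencode() does the escaping itself.
# Only the fields that change are shown here.
formFields = (
    (r'ctl00$ContentPlaceHolder1$optlist', 'State Wise'),
    (r'ctl00$ContentPlaceHolder1$optlistChoice', 'List of All Schools'),
)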

1 Answer:

Answer 0 (score: 0)

The easiest way is to switch to the requests library and try the script below. Take the following approach only when you have no time to code and are in a hurry. You can see the same results on the website when you select ASSAM from the dropdown and press the Search button.

import requests
from bs4 import BeautifulSoup

URL = "http://cbseaff.nic.in/cbse_aff/schdir_Report/userview.aspx"
payload="__EVENTTARGET=&__EVENTARGUMENT=&__LASTFOCUS=&__VIEWSTATE=sR0epHdr4qQ9SeEuqmUUBWzZHpFAyMj5Xr%2BhGagE8Be7zDbBqQgdx1%2BqrCMY8JfcRJ%2BBhmH8eEKMS%2F3VeboGFTjN0LZz1p2mx3FfY2OVkzs353bmDLWhtvBlOVnyFgcCXGyI9Li3Wp22e5txxKQwtdrKTWheuZLatRvI9wztvyeGueD9ZmEl8gIQHT77fIyt3N%2Bi3dn%2FhUdvvi%2FRqHR%2FaE1YfW3RmmSECDepwYfmBlSH3e4zTDQskW6XoOD7jykryp8L845QWclBE37ttOc5zXndfqkE%2FKCwOxiCxzRVPWCIFD9mBfELERlDOzq0cKwg6hHcE%2B4ZgyxJKBwNbW%2FDBtl%2BzpATY8arhjAg0CCf5CGmYMG1w2aaYz%2FSQ6bKQYWAaiH%2B0t9lrV291vPNj5Gfy1rWwrUhmw%2F6LcvX7uof41OUHtfGt%2Bn1IlxODsPu8i89hjiu5rqprmltTKSzoOZXJuQ1sfyRneShA7WeSIO0M4UmzUVdnjsNlimBmcc7vwAgvF8E6SJ8DaGCbPaLLH6tsI%2BISTBWPWw97o7KQyS%2FPicpZQA2Q5kzCHkAeroJWWrVILlCxEAEuOmzZffW7qIiiUwI3Urbixw40YDkjMPgVZMbb74EiEvmMGr1vNOMcjoICgM54sY5cViGP945D86fRlbS%2B6yYxNH951EZNrmRWnASiT2D%2FEbGMAkOwcYKrOWBrtMXEp8GPsswxryCZp2eANe7ajSioa0utT2cmGG5T7uiypveyfM0LBzgVTMqr5q7oMyUI9ML%2Bx8LHUFSh6SrjP1Nj7xs3%2BFPr2WaXoIs57hhUKfpE7u7Du9EGd1iXapf2WYKbvsl38Ov5u3FcPJ8L%2BLv%2FtksFE7Y7K2ZQ59p0Jqr%2BF%2FbMaKZeRRPp%2BMpW1TblAI8RVP1u%2BTDAHz19jrdpsralLWYoweP3wS5kdDcx68IukuaeGl0ESXcbNYWdFF0pEolfWLBSWcbOpR3YPpxVQD5Eqq3z%2FsYmk%2B%2B%2ByRXOH%2FizUrCqDqQjwbTw4UqRJY5HewB0q0Xp6jrPoT0%2FztPwteGCb6xonx1rzVkr84aOUw9IBUkCcvEtyi0S8VcUyVyii6KK9CThtyTHjbgut%2BdjuIOBlXPL9bD6u2frJiGGacVziPZInp6%2FtKcmnYn89Kcr0Ec1o3VZ6PD6xe8SUyMkupf%2FruNhbK6r6ZAXKZ3zMhkiDsMZYVGUihU43gNj3c8X%2BFD8ONb1827AfaDydgS1JxMtA9z4eOTqmoiPQ61vSr2j8IXokWJSDNg%2FhIn1uChG8BoEst2qZyVoQify0g%2FDMN%2B%2BPMMK7KPqnAfT4P17QI4Gn3mg2kFzfuQdnsQgs7aB0zAt0jrMgCoTDxuwbNvZ1w5BMF0bFbbfxi9QvbCXdpifAWbgAyutlo3wCbD1lIv3NmzLFQ61Mpih5zIOU2z9bpBeH4nClXcAN7QQVQIq8w%3D&__VIEWSTATEGENERATOR=AC872EDD&__VIEWSTATEENCRYPTED=&__EVENTVALIDATION=B7%2FH7uAZc%2FaW7NlReiB%2BrW3ac6DmQz82BOha1KMrmjcJR7%2F3qZByA7RNvmBBI36ivFrYkmYQ8SptltVOT45oBSWn4HG%2FRBbAWPHtcWY7qtl4okgXZ831Q1MTlxdIOkn2uPcoQOsostEzjJ8LVZHkcx%2FVjr6Fb1zImxNbSPDRDVJ1QLmYSCaf2RbJkzmP7ZiqR3w9MXn7GliipkRdhVHlJaDrh7eFy9zOjEcG2Ed2v0NxA5lnpnrXFcE42f9W%2BnLwNfUPR%2BiB95TtvY52ucsD5CgjWqlm9uMrDzHL1kl3WGzg6eU%2BIA9J744%2FRM2TD5JfhPykP6DB9E3E9%2BWzSSJowqwSzOwLNCjcbC%2BvBUb3GPXPwadz%2Fg3pEGTiBWtqdBCeOUiKnkDeDOrno8fS1RPu%2BVx%2F6M1LGWddW2CBUa8m3CizqMfLTGP7HVj4VpnSU0fttCuY26UTZzzMmplPmCjZziEJHd%2F5jc%2Byyf517tk%2BfHA%3D&ctl00%24ContentPlaceHolder1%24optlist=State+Wise&ctl00%24ContentPlaceHolder1%24ddlitem=2+&ctl00%24ContentPlaceHolder1%24optlistChoice=List+of+All+Schools&ctl00%24ContentPlaceHolder1%24search=Search"

with requests.Session() as s:
    s.headers={"User-Agent":"Mozilla/5.0"}
    s.headers.update({'Content-Type': 'application/x-www-form-urlencoded'})
    res = s.post(URL,data=payload)
    soup = BeautifulSoup(res.text,"lxml")
    for item in soup.select("#hyp,#Hyperlink1"):
        print(item.text)

Partial results:

Abanti Kumar Mahanta Educational Institution
adarsh vidyalaya
adarsh vidyalaya
adarsh vidyalaya dahalapara
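Rather than hardcoding the session-captured tokens, a more durable variant is to GET the page first and copy every hidden input into the payload before posting. The sketch below is untested against the live site; the visible field names and the state value '22' are taken from the question's code and are assumptions here:

import requests
from bs4 import BeautifulSoup

URL = "http://cbseaff.nic.in/cbse_aff/schdir_Report/userview.aspx"

with requests.Session() as s:
    s.headers["User-Agent"] = "Mozilla/5.0"
    # GET once so the server issues fresh ASP.NET state tokens.
    soup = BeautifulSoup(s.get(URL).text, "lxml")
    payload = {tag["name"]: tag.get("value", "")
               for tag in soup.select("input[type=hidden]")}
    # Mimic choosing a state from the dropdown and pressing Search;
    # requests form-encodes the dict, so spaces need no manual escaping.
    payload.update({
        "ctl00$ContentPlaceHolder1$optlist": "State Wise",
        "ctl00$ContentPlaceHolder1$ddlitem": "22",
        "ctl00$ContentPlaceHolder1$optlistChoice": "List of All Schools",
        "ctl00$ContentPlaceHolder1$search": "Search",
    })
    res = s.post(URL, data=payload)
    for item in BeautifulSoup(res.text, "lxml").select("#hyp,#Hyperlink1"):
        print(item.text)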