我要抓取一个有多个页面的网站,并使用像这样的javascript
<td><a href="javascript:__doPostBack('gv_AgentList1','Page$2')">2</a></td><td><a href="javascript:__doPostBack('gv_AgentList1','Page$3')">3</a></td>
在这里它们更改页面,并且这种效果反映在DOM中的__EVENTARGUMENT中,例如__EVENTARGUMENT:Page$2
我试图遍历它,但是多次收到相同的首页结果。谁能帮我。
下面是我的代码:
from bs4 import BeautifulSoup
import requests
import csv
import sqlite3
url = "https://rera.cgstate.gov.in/"
final_data = []
def getdatabyget(url,values):
res = requests.get(url,values)
text = res.text
return text
def readheaders():
global url, final_data
for i in range(1, 4):
argument = "Page$"+ str(i+1)
htmldata = getdatabyget(url, {})
soup = BeautifulSoup(htmldata, "html.parser")
EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']
VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
headers= {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Content-Type':'application/x-www-form-urlencoded',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
formfields = {"__ASYNCPOST":"true",
"__EVENTARGUMENT":argument,
"__EVENTTARGET":"gv_AgentList1",
"__EVENTVALIDATION":EVENTVALIDATION,
"__LASTFOCUS":"",
"__VIEWSTATE":VIEWSTATE,
"ApplicantType":"0",
"Button1":"Search",
"color_value":"0",
"District_Name":"0",
"DropDownList1":"0",
"DropDownList2":"0",
"DropDownList4":"0",
"DropDownList5":"0",
"group1":"on",
"hdnSelectedOption":"0",
"hdnSelectedOptionForContractor":"0",
"language_value":"0",
"Mobile":"",
"Tehsil_Name":"0",
"TextBox1":"",
"TextBox2":"",
"TextBox3":"",
"TextBox4":"",
"TextBox5":"",
"TextBox6":"",
"ToolkitScriptManager1":"appr1|Button1",
"txt_otp":"",
"txt_proj_name":"",
"txtRefNo":"",
"txtRefNoForContractor":""}
s = requests.session()
res = s.post(url, data=formfields, headers=headers).text
soup = BeautifulSoup(res, "html.parser")
data = soup.find_all("table")[0]
gettr = data.find_all("tr")[1:-2]
for i in gettr:
add_list = []
blank = ""
projectname = i.find_all("td")[0].text
reranumber = i.find_all("td")[1].text.replace(" ","")
Authorised = i.find_all("td")[2].text.replace("\n","")
promoternme = i.find_all("td")[3].text.replace("\n","")
projecttype = i.find_all("td")[4].text.replace("\n","")
district = i.find_all("td")[5].text.replace("\n","")
tehsil = i.find_all("td")[6].text.replace("\n","")
approveddate = i.find_all("td")[7].text.replace("\n","")
enddate = i.find_all("td")[8].text.replace("\n","")
add_list.append(projectname)
print(add_list)
readheaders()
上面是代码。我该如何解决这个问题。请开导。