我正在尝试网络抓取此网站http://uprera.azurewebsites.net/View_projects.aspx
如何使用本网站: 从下拉列表中选择任何值,然后单击搜索,您将看到table.In表单击,在视图详细信息上按ctrl。它打开了一个新窗口,我试图为每个下拉值的webscrape链接
当我运行以下代码时,它会废弃网站,但会给我上面提到的网站网址而不是我正在寻找的链接
下面是我的代码:
import requests
from bs4 import BeautifulSoup
import csv
import time
final_data = []
url = "http://uprera.azurewebsites.net/View_projects.aspx"
response = requests.get(url).text
soup = BeautifulSoup(response,"html.parser")
VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']
for title in soup.select("#ContentPlaceHolder1_DdlprojectDistrict [value]")[:-1]:
search_item = title.text
# print(search_item)
headers= {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Content-Type':'application/x-www-form-urlencoded',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
formfields = {'__VIEWSTATE':VIEWSTATE, #Put the value in this variable
'__VIEWSTATEGENERATOR':'4F1A7E70',
'__EVENTVALIDATION':EVENTVALIDATION, #Put the value in this variable
'ctl00$ContentPlaceHolder1$DdlprojectDistrict':search_item, #this is where your city name changes in each iteration
'ctl00$ContentPlaceHolder1$txtProject':'',
'ctl00$ContentPlaceHolder1$btnSearch':'Search'}
#here in form details check agra , i am able to scrape one city only,
# how to loop for all cities
s= requests.session()
res = s.post(url, data=formfields, headers=headers).text
soup = BeautifulSoup(res, "html.parser")
VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']
get_list = soup.find_all('option') #gets list of all <option> tag
for element in get_list :
cities = element["value"]
#final.append(cities)
#print(final)
get_details = soup.find_all("table", attrs={"id":"ContentPlaceHolder1_GridView1"})
for details in get_details:
text = details.find_all("tr")[1:]
count = 0
for tds in text:
td = tds.find_all("td")[1]
rera = td.find_all("span")
rnumber = ""
for num in rera:
rnumber = num.text
sublist = []
sublist.append(rnumber)
name = tds.find_all("td")[2]
prj_name = name.find_all("span")
prj = ""
for prjname in prj_name:
prj = prjname.text
sublist.append(prj)
promoter_name = tds.find_all("td")[3]
promoter = promoter_name.find_all("span")
prom = ""
for promname in promoter:
prom = promname.text
sublist.append(prom)
district = tds.find_all("td")[4]
dist = district.find_all("span")
district_name = ""
for districtname in dist:
district_name = districtname.text
sublist.append(district_name)
project_type = tds.find_all("td")[5]
project = project_type.find_all("span")
btn_td = tds.find_all("td")[6]
ip_name = btn_td.find("input").attrs['name']
dct = {}
dct['__VIEWSTATE']=VIEWSTATE
dct['__VIEWSTATEGENERATOR']=formfields['__VIEWSTATEGENERATOR']
dct['__EVENTVALIDATION']=EVENTVALIDATION
dct['ctl00$ContentPlaceHolder1$txtProject'] = ''
dct['ctl00$ContentPlaceHolder1$DdlprojectDistrict'] = formfields['ctl00$ContentPlaceHolder1$DdlprojectDistrict']
dct[ip_name+'binid'] = '6869'
dct[ip_name+'hfFlag'] = 'edit'
dct[ip_name+'ddlPRJ'] = 'Agra'
dct[ip_name+'txtPRJ'] = ''
resp = s.post(url, data=dct, headers=headers)
projectype = ""
for prjtype in project:
projectype = prjtype.text
sublist.append(projectype)
print( resp.url )
sublist.append( resp.url )
final_data.append(sublist)
count += 1
print(count)
filename = "UP_RERA.csv"
with open("./"+filename, "w") as csvfile:
csvfile = csv.writer(csvfile, delimiter=",")
csvfile.writerow("")
for i in range(0, len(final_data)):
csvfile.writerow(final_data[i])
任何人都可以帮助我。我正在解析正确的网址
答案 0 :(得分:0)
网址是由JavaScript创建的 - 但这些网址似乎具有相同的架构
Sub DeleteBadRows()
Dim i As Variant
Dim RowNbr As Long
Dim ColNbr As Long
Dim BadChr() As Variant
Dim LR As Long
BadChr = Array("=", "*", ",FEE", "DATE 12/13", ",(", "SMSLIST O", "REQUEST T", "WHERE", "SVC") 'include any characters to trigger deletion of row
LR = Cells.Find("*", searchorder:=xlByRows, searchdirection:=xlPrevious).Row
For RowNbr = LR To 1 Step -1
For ColNbr = 1 To Cells.Find("*", searchorder:=xlByRows, searchdirection:=xlPrevious).Column
For i = LBound(BadChr) To UBound(BadChr)
If InStr(Cells(RowNbr, ColNbr), BadChr(i)) Then
Cells(i).EntireRow.Delete
Exit For
End If
Next i
Next ColNbr
Next RowNbr
http://uprera.azurewebsites.net/View_Registration_Details.aspx?binid=10996&hfFlag=edit&ddlPRJ=Lucknow&txtPRJ=
- district
- 来自Lucknow
- rare
的数字 - 因此您可以手动生成它。
包含更改的代码:
对于测试,我使用10996
限制屏幕上的数据。
if count > 3: break