I am trying to scrape an aspx site: https://www.aae.org/patients/find.aspx. For testing purposes, please use 33133 as the zipcode & 100 as the radius.
Initially I collected the profile links by iterating over the search result pages. I successfully got the first 20 links from page 1, but could not get past page 1: the returned page source said 'We're sorry, the page or file you are looking for could not be found'.
Please see my code below:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, re
import urllib.request, urllib.parse, time, csv
from bs4 import BeautifulSoup
from lxml import html
from sys import argv

profile_links = []

def result_checker(self):
    No_results = self.xpath('//td[@colspan="3"]//p//text()')
    if "No results" in str(No_results):
        print(str(No_results).replace("['", "").replace(".']", "") + " for other zipcodes")
        time.sleep(10)
        sys.exit()
    else:
        pass

def Get_data(zipcode, radius):
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'en-US,en;q=0.8,pt;q=0.6',
               'Connection': 'keep-alive',
               'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
               'Host': 'www.tcms.com',
               'Origin': 'https://www.aae.org',
               'Referer': 'https://www.aae.org/patients/find.aspx'}

    class MyOpener(urllib.request.FancyURLopener):
        version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'

    myopener = MyOpener()
    url = 'https://www.aae.org/patients/find.aspx'
    f = myopener.open(url)
    soup = BeautifulSoup(f, 'lxml')
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']

    formData = (
        ('__EVENTVALIDATION', eventvalidation),
        ('__VIEWSTATE', viewstate),
        ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch', 'Search'),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius', radius),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode', zipcode),
        ('EktronClientManager', EktronClientManager),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind', 'SEARCH'))
    encodedFields = urllib.parse.urlencode(formData)

    f1 = myopener.open(url, encodedFields)
    source = f1.read()
    target = open('sample.txt', 'w')
    target.write(str(source))
    target.close()

    source1 = html.fromstring(source)
    result_checker(source1)
    links = source1.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        if "MemberID" and "AddressID" in each:
            print(each)
            profile_links.append("https://www.aae.org/patients/" + str(each))
        else:
            pass

    j = 2
    soup2 = BeautifulSoup(source, 'lxml')
    viewstate = soup2.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']

    while j < 5:
        pages = 'Page$' + str(j)
        print(pages, '\n---------------')
        formData1 = (('__EVENTTARGET', 'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults'),
                     ('__EVENTARGUMENT', pages),
                     ('__VIEWSTATE', viewstate),
                     ('__EVENTVALIDATION', eventvalidation),
                     ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch', 'Search'))
        encodedFields1 = urllib.parse.urlencode(formData1)
        f2 = myopener.open(url, encodedFields1)
        source2 = f2.read()
        target = open('sample.txt', 'w')
        target.write(str(source2))
        target.close()
        source3 = html.fromstring(source2)
        links2 = source3.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links2:
            if "MemberID" and "AddressID" in each1:
                print(each1)
                profile_links.append("https://www.aae.org/patients/" + str(each1))
            else:
                pass
        soup3 = BeautifulSoup(source2, 'lxml')
        viewstate = soup3.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup3.select("#__EVENTVALIDATION")[0]['value']
        j += 1

if __name__ == "__main__":
    #Get_data('38132', 5)
    Get_data('33133', 100)
Answer (score: 0)
Yes Greg Sadetsky, you were absolutely right about the cookies. Creating a session and then sending every POST request through it, together with the required form-data parameters, was what was needed.
With the help of the Requests library I was able to create a session that stores cookies and reuses them across requests.
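To illustrate the cookie point before the full script: a requests.Session keeps whatever cookies the server sets (for example an ASP.NET session id) and sends them back automatically on every later request made through the same session, which a fresh urllib opener does not do. A minimal sketch of that behaviour, assuming only the search URL from the question (the POST payload here is a placeholder, not the real form data):

import requests

url = 'https://www.aae.org/patients/find.aspx'

s = requests.Session()
r = s.get(url)                # the server may set cookies on this first response
print(s.cookies.get_dict())   # cookies are stored on the session object

# Any later request through the same session sends those cookies back,
# so the server can tie the paging postback to the earlier search.
r2 = s.post(url, data={'example': 'payload'})   # hypothetical payload, for illustration only
print(r2.status_code)

The full working script follows.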
import requests
from bs4 import BeautifulSoup
from requests import Request, Session
from lxml import html

def Get_data(zipcode, radius):
    All_links = []
    url = 'https://www.aae.org/patients/find.aspx'
    s = requests.Session()
    # Initial GET: picks up the session cookie and the ASP.NET hidden fields
    r = s.get(url)
    #print (r.text.encode('utf-8'))
    soup = BeautifulSoup(r.content, 'lxml')
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']

    # First POST: submit the search form with the zipcode and radius
    params = {'EktronClientManager': EktronClientManager,
              '__VIEWSTATE': viewstate,
              '__EVENTVALIDATION': eventvalidation,
              'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch': 'Search',
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius': radius,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode': zipcode,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind': 'SEARCH'}
    r2 = s.post(url, data=params)
    source = html.fromstring(r2.content)
    links = source.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        # keep only profile links that carry both query parameters
        if "MemberID" in each and "AddressID" in each:
            print(each)
            All_links.append("https://www.aae.org/patients/" + str(each))
    #print (r2.content)

    # Re-read the hidden fields from the results page before paging
    soup1 = BeautifulSoup(r2.content, 'lxml')
    viewstate = soup1.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup1.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup1.select("#EktronClientManager")[0]['value']

    # Subsequent POSTs: fire the GridView paging postback (Page$2, Page$3, ...)
    j = 2
    while j < 7:
        page = 'Page$' + str(j)
        print(page)
        params1 = {'__EVENTTARGET': 'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults',
                   '__EVENTARGUMENT': page,
                   'EktronClientManager': EktronClientManager,
                   '__VIEWSTATE': viewstate,
                   '__EVENTVALIDATION': eventvalidation,
                   'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch': 'Search'}
        r3 = s.post(url, data=params1)
        source1 = html.fromstring(r3.content)
        links1 = source1.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links1:
            if "MemberID" in each1 and "AddressID" in each1:
                print(each1)
                All_links.append("https://www.aae.org/patients/" + str(each1))
        # The hidden state fields change on every response, so refresh them for the next page
        soup2 = BeautifulSoup(r3.content, 'lxml')
        viewstate = soup2.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']
        EktronClientManager = soup2.select("#EktronClientManager")[0]['value']
        j += 1

Get_data(33133, 100)
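Since the three hidden ASP.NET fields are re-read after every response, one possible cleanup (a sketch of my own, not part of the answer above; the helper name get_hidden_fields is mine) is to pull that repetition into a small function:

from bs4 import BeautifulSoup

def get_hidden_fields(content):
    # Extract the ASP.NET state fields that must be echoed back on the next POST
    soup = BeautifulSoup(content, 'lxml')
    return {'__VIEWSTATE': soup.select("#__VIEWSTATE")[0]['value'],
            '__EVENTVALIDATION': soup.select("#__EVENTVALIDATION")[0]['value'],
            'EktronClientManager': soup.select("#EktronClientManager")[0]['value']}

Each POST payload can then be built by updating a copy of that dict with the search or paging parameters.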