我想做的事情如下:打开搜索页面(http://search.cpsa.ca/PhysicianSearch),点击“搜索”,抓取数据,然后点击“下一步”,并不断重复,直到没有更多页面为止。在点击“下一步”之前,一切都正常。下面是我的代码。我两次打印 r.content,得到的格式完全不同,这说明 GET 请求和 POST 请求得到了不同的处理,尽管我希望两者的行为非常相似。为什么会这样?
奇怪的是,即使 POST 请求之后返回的内容看起来不对,我仍然可以解析出我需要的网址——只是解析不到 __EVENTVALIDATION 输入字段。
错误消息(代码的结尾)表示内容不包含我需要发出后续请求的数据,但导航到页面显示它确实拥有该数据,并且格式非常类似于第一页。
编辑:我把脚本正在解析的 HTML 在浏览器中打开查看,结果肯定有问题。运行下面的代码就会打开这些页面。GET 返回的页面包含如下数据:
<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="4424DBE6">
<input type="hidden" name="__VIEWSTATEENCRYPTED" id="__VIEWSTATEENCRYPTED" value="">
<input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="TlIgNH
而 POST 返回的页面则把所有数据以纯文本形式放在页面底部,如下所示:
|0|hiddenField|__EVENTTARGET||0|hiddenField|__EVENTARGUMENT||0|hiddenField|_
import requests
from lxml import html
from bs4 import BeautifulSoup
# Reproduction script from the question: fetch the search page, submit the
# search form, then try to request the next page of results.  The code is
# left exactly as asked about; the comments below only annotate it.

# NOTE(review): this request is made outside the session and ``page`` is not
# referenced again anywhere in this snippet.
page = requests.get('http://search.cpsa.ca/physiciansearch')
print('got page!')
# Form fields sent with the initial "Search" POST.  The hidden-field values
# (__EVENTVALIDATION, __VIEWSTATE, __VIEWSTATEGENERATOR) are filled in later
# from the page returned by the GET.
d = {"ctl00$ctl13": "ctl00$ctl13|ctl00$MainContent$physicianSearchView$btnSearch",
     "ctl00$MainContent$physicianSearchView$txtLastName": "",
     'ctl00$MainContent$physicianSearchView$txtFirstName': "",
     'ctl00$MainContent$physicianSearchView$txtCity': "",
     "__VIEWSTATEENCRYPTED":"",
     'ctl00$MainContent$physicianSearchView$txtPostalCode': "",
     'ctl00$MainContent$physicianSearchView$rblPractice': "",
     'ctl00$MainContent$physicianSearchView$ddDiscipline': "",
     'ctl00$MainContent$physicianSearchView$rblGender': "",
     'ctl00$MainContent$physicianSearchView$txtPracticeInterests': "",
     'ctl00$MainContent$physicianSearchView$ddApprovals': "",
     'ctl00$MainContent$physicianSearchView$ddLanguage': "",
     "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
     "__EVENTARGUMENT": "",
     'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
     'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
     '__ASYNCPOST': 'true'}
# Headers sent with every request below (the GET as well as both POSTs).
# NOTE(review): presumably the X-MicrosoftAjax / __ASYNCPOST combination is
# what makes the server answer the POST with the pipe-delimited text the
# question describes instead of a full HTML page -- confirm.
h ={ "X-MicrosoftAjax":"Delta = true",
     "X-Requested-With":"XMLHttpRequest",
     "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
     }
# Accumulates every result link collected so far.
urls = []
with requests.session() as s:
    # Step 1: GET the search page and parse it twice (BeautifulSoup for the
    # hidden fields, lxml for XPath and for opening it in a browser).
    r = s.get("http://search.cpsa.ca/PhysicianSearch",headers=h)
    soup = BeautifulSoup(r.content, "lxml")
    tree = html.fromstring(r.content)
    html.open_in_browser(tree)
    # Step 2: copy the hidden form tokens from the GET page into the POST data.
    ev = soup.select("#__EVENTVALIDATION" )[0]["value"]
    vs = soup.select("#__VIEWSTATE")[0]["value"]
    vsg = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
    d["__EVENTVALIDATION"] = ev
    d["__VIEWSTATEGENERATOR"] = vsg
    d["__VIEWSTATE"] = vs
    # Step 3: submit the search form.
    r = s.post('http://search.cpsa.ca/PhysicianSearch', data=d,headers=h)
    print('opening in browser')
    # NOTE(review): ``tree`` still holds the tree parsed from the GET response;
    # the POST response in ``r`` has not been parsed at this point.
    retrievedUrls = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
    print(retrievedUrls)
    for url in retrievedUrls:
        urls.append(url)
    # Step 4: paging loop.  ``endSearch`` is set to True at the bottom of the
    # body, so as written the loop runs at most once.
    endSearch = False
    while endSearch == False:
        # Parse the most recent POST response.
        tree = html.fromstring(r.content)
        html.open_in_browser(tree)
        soup = BeautifulSoup(r.content, "lxml")
        print('soup2:')
        ## BREAKS HERE
        # The select() below returns an empty list for the POST response, so
        # indexing [0] raises the IndexError reported by the question.
        ev = soup.select("#__EVENTVALIDATION" )[0]["value"]
        ## BREAKS HERE,
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        vsg = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
        # Re-target the form data at the "Next" pager button.
        d["ctl00$ctl13"] = "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl01$btnNextPage"
        d["__EVENTVALIDATION"] = ev
        d["__EVENTTARGET"] = ""
        d["__VIEWSTATEGENERATOR"] = vsg
        d["__VIEWSTATE"] = vs
        # NOTE(review): the same key is assigned the same value twice.
        d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"] = 1
        d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"] = 1
        d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$btnNextPage"] = "Next"
        # NOTE(review): this POST goes through the module-level ``requests``
        # function rather than the session ``s`` used for the earlier requests.
        r = requests.post('http://search.cpsa.ca/PhysicianSearch', data=d,headers=h)
        # NOTE(review): the response is parsed twice in a row.
        tree = html.fromstring(r.content)
        tree = html.fromstring(r.content)
        retrievedUrls = tree.xpath('//*[@id="MainContent_physicianSearchView_gvResults"]/tr/td[2]/a/@href')
        print(urls)
        print(retrievedUrls)
        endSearch = True
...
Traceback (most recent call last):
File "C:\Users\daniel.bak\workspace\Alberta Physician Scraper\main\main.py", line 63, in <module>
ev = soup.select("#__EVENTVALIDATION" )[0]["value"]
IndexError: list index out of range
答案 0 :(得分:5)
这个问题差点把我逼疯,不过终于可以工作了:每次 POST 之前,都必须先发一次 GET 请求,
以获取新的 __EVENTVALIDATION 令牌:
import requests
from bs4 import BeautifulSoup
# Headers sent with the initial GET and with both POSTs.
h = {
    "X-MicrosoftAjax": "Delta = true",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
}

# Form data for the initial "Search" POST; the validation tokens are added
# once the search page has been fetched.
d = {
    "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$btnSearch",
    "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
    'ctl00$MainContent$physicianSearchView$hfPrefetchUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
    'ctl00$MainContent$physicianSearchView$hfRemoveUrl': "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
    '__ASYNCPOST': 'true',
}

# Form data for the pager POST.  The two ddlPager entries are the page to go
# to ("2") and the page being left ("1").
nxt_d = {
    "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1",
    "__ASYNCPOST": "true",
    "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
}

url = "http://search.cpsa.ca/PhysicianSearch"


def _state_tokens(markup):
    """Return the hidden validation fields found in *markup*.

    Args:
        markup: Raw HTML of the search page as returned by a GET.

    Returns:
        A dict with the ``__EVENTVALIDATION`` and ``__VIEWSTATE`` values,
        ready to be merged into a POST payload.

    Raises:
        IndexError: If either hidden input is missing from *markup*.
    """
    soup = BeautifulSoup(markup, "lxml")
    return {
        "__EVENTVALIDATION": soup.select("#__EVENTVALIDATION")[0]["value"],
        "__VIEWSTATE": soup.select("#__VIEWSTATE")[0]["value"],
    }


with requests.session() as s:
    # Fetch the page once to obtain the tokens for the search POST.
    r = s.get(url, headers=h)
    d.update(_state_tokens(r.content))
    r = s.post(url, data=d, headers=h)

    # A fresh GET is needed before every further POST to obtain new tokens.
    # As in the original snippet, this GET is sent without the extra headers.
    nxt_d.update(_state_tokens(s.get(url).content))

    # ``r`` now holds the response for page 2 of the results.
    r = s.post(url, data=nxt_d, headers=h)
如果查看最后一次 POST 返回的源码,会看到已经到了第 2 页。要遍历所有页面还需要添加更多逻辑,我稍后补充。
参数:
"ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
"ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1"
分别是要跳转到的页面和当前所在的页面;因此在每次 GET 之后,应该只需要更改这两个值。
下面的代码会获取所有页面,并以编程方式提取大部分参数值。借助正则表达式也许还能提取更多,但它已经在不硬编码的情况下提取了其中的大部分:
from lxml.html import fromstring
import requests
class Crawler(object):
    """Iterate over every results page of the physician search form.

    The crawler fetches the search page, submits the search as an async
    postback and then keeps posting the pager form, fetching fresh
    ``__EVENTVALIDATION`` / ``__VIEWSTATE`` values before each POST.
    """

    # Matches the "Next" pager button only while it is present and enabled.
    _NEXT_ENABLED = (
        "//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']"
        "[not(@disabled)]"
    )

    def __init__(self, ua, url):
        """Store the target and prepare the static parts of both payloads.

        Args:
            ua: User-Agent string sent with the initial GET and every POST.
            url: Address of the search page; it is fetched and posted to.
        """
        self.user_agent = ua
        self.post_header = {"X-MicrosoftAjax": "Delta = true", "X-Requested-With": "XMLHttpRequest", "user-agent": ua}
        # Payload for every pager POST; page numbers and tokens are added later.
        self.post_data2 = {'__ASYNCPOST': 'true',
                           "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}
        self.url = url
        # Payload for the initial search POST.
        self.post_data1 = {'__ASYNCPOST': 'true'}

    def populate(self, xml):
        """Pull form post data keys and values for the initial post.

        Args:
            xml: Parsed tree of the search page returned by the first GET.

        Raises:
            IndexError: If an expected form element is missing from *xml*.
        """
        for element_id in ("hfPrefetchUrl", "hfRemoveUrl"):
            field = xml.xpath("//*[@id='%s']" % element_id)[0]
            self.post_data1[field.get("name")] = field.get("value")
        # The search button's name doubles as the event target.
        search_name = xml.xpath("//input[@value='Search']/@name")[0]
        self.post_data1["ctl00$ctl13"] = search_name
        self.post_data1["__EVENTTARGET"] = search_name

    def populate2(self, xml):
        """Pull form post data keys for all subsequent posts.

        Records the names of the two pager fields and sets the initial
        page numbers: the first name gets "2", the second gets "1".

        Args:
            xml: Parsed tree of the first results page.

        Raises:
            IndexError: If fewer than two pager elements are found.
        """
        data = xml.xpath("//*[@id='MainContent_physicianSearchView_gvResults_ddlPager']/@name")
        self.pge = data[0]
        self.ev = data[1]
        self.post_data2["__EVENTTARGET"] = self.ev
        self.post_data2[self.ev] = "1"
        self.post_data2[self.pge] = "2"

    @staticmethod
    def put_validation(xml, d):
        """Copy ``__EVENTVALIDATION`` and ``__VIEWSTATE`` from *xml* into *d*.

        A fresh pair of values is needed for each post.

        Args:
            xml: Parsed tree of a freshly fetched search page.
            d: Payload dict to update in place.

        Raises:
            IndexError: If either hidden field is missing from *xml*.
        """
        d["__EVENTVALIDATION"] = xml.xpath("//*[@id='__EVENTVALIDATION']/@value")[0]
        d["__VIEWSTATE"] = xml.xpath("//*[@id='__VIEWSTATE']/@value")[0]

    def next_page(self, d=None):
        """Increment both pager values in the pager payload by one.

        Args:
            d: Unused; accepted only so existing callers keep working.
        """
        for key in (self.pge, self.ev):
            self.post_data2[key] = str(int(self.post_data2[key]) + 1)

    def start(self):
        """Yield the parsed tree of each results page, first page first.

        Yields:
            The lxml tree built from each POST response.
        """
        with requests.session() as s:
            # Get the initial page to pull __EVENTVALIDATION etc.
            req = s.get(self.url, headers={"user-agent": self.user_agent}).content
            xml = fromstring(req)
            self.put_validation(xml, self.post_data1)
            # Populate the rest of the post data.
            self.populate(xml)
            resp = fromstring(s.post(self.url, data=self.post_data1, headers=self.post_header).content)
            # First page of results.
            yield resp
            # Stop when the latest response has no enabled "Next" button.
            # NOTE(review): presumably the button carries a ``disabled``
            # attribute on the last page -- confirm against a live response.
            if not resp.xpath(self._NEXT_ENABLED):
                return
            # Fill post data for the following pages.
            self.populate2(resp)
            while True:
                # Update the __EVENTVALIDATION token and __VIEWSTATE.
                self.put_validation(fromstring(s.get(self.url).content), self.post_data2)
                # Post to get the next page of results.
                resp = fromstring(s.post(self.url, data=self.post_data2, headers=self.post_header).content)
                yield resp
                if not resp.xpath(self._NEXT_ENABLED):
                    break
                self.next_page()
# Example driver: crawl the physician search and visit every results page.
ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
url = "http://search.cpsa.ca/PhysicianSearch"
c = Crawler(ua, url)
# Each item yielded by start() is the parsed tree of one results page.
for tree in c.start():
    # use tree
    pass  # placeholder so the loop is valid until real handling is added