我并不是网页抓取的新手。但是当我开始抓取这个网址(BSE insider trading)时,
我遇到了一个非常奇怪的错误。
关于此页面的细节:此页面属于印度市场的一个主要交易所。默认情况下,当此页面加载时,它会显示最近的内幕交易信息。当我们选择日期并提交时,会发出一个 POST 请求并显示数据。当然,数据跨越多个页面。
当我发送带有自定义日期的相应 POST 数据时,对于 Page$2(表示我想要第 2 页的信息)
我会收到 500 状态码,
而其余所有页面都返回正常。我尝试过对这个 POST 变量进行 urlencode,也随便试了其他一些办法,但都没有成功。请问有人可以帮忙确认一下这种情况吗?
我正在使用 python、requests、lxml 进行抓取,这是我的代码:
import requests
from lxml import html
import urllib
# Page being scraped: the BSE insider-trading listing.
url = "http://www.bseindia.com/corporates/Insider_Trading.aspx?expandable=0"

# Form fields posted with every request. The hidden state fields are
# hard-coded here: __VIEWSTATE is empty and __EVENTVALIDATION is a fixed
# literal rather than a value read from a freshly loaded page.
data = {
    'ctl00$ContentPlaceHolder1$GetQuote1$smartSearch': 'Enter Scrip Name / Code / ID',
    'ctl00$ContentPlaceHolder1$GetQuote1$hdnCode': '',
    'WINDOW_NAMER': '1',
    '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$gvData',
    '__EVENTARGUMENT': 'Page$2',
    'ctl00$ContentPlaceHolder1$fmdate': '20140923',
    'ctl00$ContentPlaceHolder1$eddate': '20140929',
    'ctl00$ContentPlaceHolder1$txtDate': '01/01/2014',
    'ctl00$ContentPlaceHolder1$txtTodate': '29/09/2014',
    'ctl00$ContentPlaceHolder1$ddlregulation': 'ALL',
    '__VIEWSTATE': '',
    '__EVENTVALIDATION': '/wEWGQLoxuedDgKJsYefCgK6rpDlDwL8np6XAwLl44LQAgK0w8TlCwKHlNXODQKW0sv1BgLei4rQAwKExL/8BQLJsJHgCQLhsb3hCQLJsLmaCAL40JWiCgK9vIn8DAKN+qTQCAKN+qzQCAKN+rjQCAKN+rTQCAKN+qDQCAKN+rzQCAKN+ojQCAKN+oTQCALj8ODgCQKI2cKdDA==',
}

# Cookie values pasted in as fixed literals.
cookies = {
    '_gat': '1',
    'expandable': '0c',
    '_asc': '099bb6b2148be3ebcdb9a1f31af',
    '__auc': '70dfa206148b7750a132dbe342b',
    '_ga': 'GA1.2.35458869.1411827174',
}

# NOTE(review): the key 'Orgin' is kept exactly as posted (the HTTP header is
# spelled 'Origin'). The request loop that follows never passes `headers`, so
# this dict currently has no effect on the requests.
headers = {
    'Referer': 'http://www.bseindia.com/corporates/Insider_Trading.aspx?expandable=0',
    'Orgin': 'http://www.bseindia.com',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36',
}
# Request result pages 1 through 9 by re-posting the form with
# __EVENTARGUMENT set to "Page$<n>", and report any non-200 response.
for i in range(1, 10):
    data['__EVENTARGUMENT'] = 'Page$' + str(i)
    print(data['__EVENTARGUMENT'])
    # Only the form data and cookies are sent; `headers` is not passed here.
    r = requests.post(url, data=data, cookies=cookies)
    if r.status_code != 200:
        # NOTE(review): the original indentation was lost; printing the page
        # index is assumed to belong to the failure branch - confirm.
        print(r.reason)
        print(i)
    root = html.fromstring(r.text)
    a = root.xpath('//*[@id="ctl00_ContentPlaceHolder1_gvData"]/tr')
    # The first two rows and the last row of the grid are skipped
    # (presumably header and pager rows - TODO confirm against the page).
    for b in a[2:-1]:
        #print b.findall('td')[1].text_content()
        pass
答案 0 :(得分:0)
不要硬编码请求参数,而是从页面中解析它们,然后跟随分页链接逐页请求:
import re
from lxml import html
import requests
def get_data(tree):
    """Print the text of the first cell of every result row in *tree*.

    The cells are selected with the XPath ``//tr[@class="TTRow"]/td[1]``, so
    *tree* must offer an lxml-style ``xpath()`` method. One line is printed
    per matched cell; nothing is returned.
    """
    for cell in tree.xpath('//tr[@class="TTRow"]/td[1]'):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(cell.text)
# Page being scraped; the same address is also sent as the Referer.
url = "http://www.bseindia.com/corporates/Insider_Trading.aspx?expandable=0"

# Browser-like headers attached to every request made below.
headers = {
    'Referer': url,
    'Origin': 'http://www.bseindia.com',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}

# One Session is shared by the initial GET and the later POSTs.
session = requests.Session()

# Fetch the main page and parse it into an element tree.
response = session.get(url, headers=headers)
root = html.fromstring(response.content)

# The main page is itself page 1 of the results, so print its rows right away.
print("PAGE 1")
get_data(root)
print("------")
# Hidden state fields are read from the page that was just loaded instead of
# being hard-coded.
viewstate = root.find('.//input[@id="__VIEWSTATE"]').attrib['value']
event_validation = root.find('.//input[@id="__EVENTVALIDATION"]').attrib['value']

# Form payload reused for every pagination POST; the __EVENTTARGET and
# __EVENTARGUMENT entries are filled in per page later on.
data = {
    'ctl00$ContentPlaceHolder1$GetQuote1$smartSearch': 'Enter Scrip Name / Code / ID',
    'ctl00$ContentPlaceHolder1$fmdate': '20140923',
    'ctl00$ContentPlaceHolder1$eddate': '20140929',
    'ctl00$ContentPlaceHolder1$txtDate': '',
    'ctl00$ContentPlaceHolder1$txtTodate': '',
    'ctl00$ContentPlaceHolder1$GetQuote1$hdnCode': '',
    'ctl00$ContentPlaceHolder1$ddlregulation': 'ALL',
    'myDestination': '#',
    'WINDOW_NAMER': '1',
    '__VIEWSTATE': viewstate,
    '__EVENTVALIDATION': event_validation,
}
# Extracts the two arguments of a pager link's
# "javascript:__doPostBack('<target>','<argument>')" href.
# Written as a raw string so the regex escapes \( and \) are not treated as
# (invalid) string-literal escape sequences.
pattern = re.compile(r"javascript:__doPostBack\('(.*?)','(.*?)'\)")

# Walk the pager links found on the first page; they are numbered from 2
# because page 1 has already been printed. The link list is evaluated once,
# before `root` is rebound inside the loop.
for index, link in enumerate(root.xpath('//tr[@class="pgr"]//td/a'), start=2):
    print("PAGE %d" % index)
    target, argument = pattern.search(link.attrib['href']).groups()
    # Only the postback target/argument change per page; the remaining form
    # fields in `data` are sent as they were set up earlier.
    data['__EVENTTARGET'] = target
    data['__EVENTARGUMENT'] = argument
    response = session.post(url, data=data, headers=headers)
    root = html.fromstring(response.content)
    get_data(root)
    print("------")
输出(每页第一列的内容):
PAGE 1
531807
532706
532959
531807
532840
533400
536507
533400
533304
506395
506395
507717
507717
506395
506395
506395
506395
531807
512393
532268
532466
532689
532689
532706
532706
------
PAGE 2
500228
500246
508969
508969
531802
532532
532532
532832
532832
500228
508969
508969
531802
500124
500124
511243
512393
512393
512393
512393
512393
512393
512393
512393
512393
------
PAGE 3
512393
512393
532466
532508
532799
532799
532799
532799
532799
532799
532799
532799
502355
502355
502355
501700
501700
500247
500247
500875
506615
500247
500875
500875
500875
------
PAGE 4
523838
523838
500875
511503
511503
511503
506615
506615
506615
506615
506615
506615
506615
506615
506615
506615
511503
511503
511503
506395
506395
524348
506685
506685
506685
------
PAGE 5
506685
------