我正在使用 Python 的 Scrapy 抓取网站 http://www.gasunietransportservices.nl/en/dataport-pages/lng-terminals/nominations,该网站以 Excel 文件的形式提供数据,但在获取数据之前我需要先设置过滤条件。
我已经在代码中完整实现了这个请求,但 FormRequest 中用到的 __VIEWSTATE 和 __EVENTVALIDATION 是动态的,每次请求都会变化。我想先从页面中抓取这两个值,以便在下一次请求中提交它们。
这两个字段都包含在HTML代码中,但它们是隐藏的 <input> 字段(例如 __VIEWSTATE)。我曾尝试使用Selenium,但没有得到有用的结果。
# URLs used by the spider, addressed by position in the request methods:
#   [0] is fetched first by start_requests();
#   [1] is the target of the form POST built in parse().
start_urls = [
    # Public nominations page for LNG terminals.
    "http://www.gasunietransportservices.nl/en/dataport-pages/lng-terminals/nominations",
    # Dataport report endpoint that receives the filter form.
    "http://dataport.gastransportservices.nl/default.aspx?ReportPath=%2fTransparency%2fNominationsPerNetworkpoint&ReportTitle=NominationsPerNetworkpoint&TransparencySegment=06",
]
def start_requests(self):
    """Yield the single initial request.

    Only the first entry of ``self.start_urls`` is requested here; its
    response is handed to ``self.parse``.
    """
    first_url = self.start_urls[0]
    yield Request(url=first_url, callback=self.parse)
def parse(self, response):
    """Build and yield the report-filter POST from the page's hidden fields.

    Reads the ``__VIEWSTATE`` and ``__EVENTVALIDATION`` hidden inputs from
    ``response`` and submits them, together with the hard-coded report
    parameters below, as a ``FormRequest`` to ``self.start_urls[1]``. The
    response to that request is handled by ``self.parse_filter``.

    If either hidden input is absent from the page, an error is logged and
    nothing is yielded, instead of failing with a bare ``IndexError``.
    """
    import logging

    sel = Selector(response)
    # Both values are read from the page on every run rather than being
    # hard-coded, because they change with each request.
    view_state = sel.xpath('//input[@id="__VIEWSTATE"]/@value').extract()
    event_validation = sel.xpath(
        '//input[@id="__EVENTVALIDATION"]/@value'
    ).extract()

    if not view_state or not event_validation:
        logging.getLogger(__name__).error(
            "Cannot build the filter request: __VIEWSTATE found=%s, "
            "__EVENTVALIDATION found=%s on %s",
            bool(view_state),
            bool(event_validation),
            response.url,
        )
        return

    formdata = {
        'scriptManager': 'scriptManager|ReportViewerControl$ctl09$Reserved_AsyncLoadTarget',
        '__EVENTTARGET': 'ReportViewerControl$ctl09$Reserved_AsyncLoadTarget',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': view_state[0],
        # NOTE(review): still hard-coded; presumably this could be scraped
        # from the page like the two fields above -- confirm.
        '__VIEWSTATEGENERATOR': 'CA0B0334',
        '__EVENTVALIDATION': event_validation[0],
        'ReportViewerControl$ctl03$ctl00': '',
        'ReportViewerControl$ctl03$ctl01': '',
        'ReportViewerControl$ctl10': 'ltr',
        'ReportViewerControl$ctl11': 'standards',
        'ReportViewerControl$AsyncWait$HiddenCancelField': 'False',
        # Hard-coded filter values (segment, network point, date range).
        'ReportViewerControl$ctl04$ctl03$txtValue': 'LNG Terminals',
        'ReportViewerControl$ctl04$ctl05$txtValue': 'ROTTERDAM (GATE) - 301345',
        'ReportViewerControl$ctl04$ctl07$txtValue': '5-10-2015 0:00:00',
        'ReportViewerControl$ctl04$ctl09$ddValue': '1',
        'ReportViewerControl$ctl04$ctl11$txtValue': '6-10-2015 0:00:00',
        'ReportViewerControl$ctl04$ctl13$ddValue': '1',
        'ReportViewerControl$ctl04$ctl15$ddValue': '1',
        'ReportViewerControl$ctl04$ctl17$ddValue': '1',
        'ReportViewerControl$ctl04$ctl05$divDropDown$ctl01': 'on',
        'ReportViewerControl$ToggleParam$store': '',
        'ReportViewerControl$ToggleParam$collapse': 'false',
        'ReportViewerControl$ctl05$ctl00$CurrentPage': '',
        'ReportViewerControl$ctl08$ClientClickedId': '',
        'ReportViewerControl$ctl07$store': '',
        'ReportViewerControl$ctl07$collapse': 'false',
        'ReportViewerControl$ctl09$VisibilityState$ctl00': 'None',
        'ReportViewerControl$ctl09$ScrollPosition': '',
        'ReportViewerControl$ctl09$ReportControl$ctl02': '',
        'ReportViewerControl$ctl09$ReportControl$ctl03': '',
        'ReportViewerControl$ctl09$ReportControl$ctl04': '100',
        '__ASYNCPOST': 'true',
    }
    yield FormRequest(
        url=self.start_urls[1],
        formdata=formdata,
        callback=self.parse_filter,
    )
def parse_filter(self, response):
    """Callback for the filter POST; currently only opens a debugger.

    No data is extracted from ``response`` yet: the body drops into an
    interactive ipdb session so the response can be inspected by hand.
    """
    # Debugging placeholder -- this blocks until the ipdb session is closed.
    import ipdb
    ipdb.set_trace()