我有一个Scrapy蜘蛛,在我提交表单后试图挑选内容。 但是我从蜘蛛中获得的输出页面非常不一致。当我浏览网页浏览器时,我抓取的所有页面都包含数据。但Scrapy一直走过Form,直到结果页面,但大多数时候都找不到结果。当结果页面确实存在时。但它确实每次都找到最后一页。所以这似乎是会话的一个问题。
这是我蜘蛛的代码:
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import scrapy
from scrapy.http import FormRequest, Request
from scrapy.shell import inspect_response
class MaharashtraSpider(scrapy.Spider):
name = "maharashtra2"
allowed_domains = ["mahavat.gov.in"]
start_urls = (
'http://mahavat.gov.in/',
)
def parse(self, response):
return Request('http://mahavat.gov.in/Tin_Search/Tinsearch.jsp',
callback=self.parse_form)
def parse_form(self, response):
base_no = 27020000034
no = base_no
for i in range(100):
yield FormRequest.from_response(
response,
formname='f1',
formdata={
"tin": "%sC" % no,
"pan": "",
"rc_no": "",
"fptecno": "",
"bptecno": "",
"DEALERNAME": "",
"Submit": "SEARCH"
},
callback=self.result_page)
no += 97 # The difference between pages with content is 97
def result_page(self, response):
url = response.xpath('//a[@class="search-head"]/@href').extract()[0]
url = response.urljoin(url)
yield Request(url, callback=self.process)
def process(self, response):
x = response.xpath("//td/text()").extract()
x = [x[i].strip() for i in range(1, len(x), 2)]
print "Dealer_Name = ", x[0]
print "Tin_Number = ", x[1]
# inspect_response(response, self)
我做错了什么?
这似乎是会话问题,而不是ajax问题,因为XHR
中没有任何post
请求。
另外,我有一种克服问题的黑客方式,但它超级慢。
这是我的hacky版本的代码。
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import scrapy
from scrapy.http import FormRequest, Request
# from scrapy.shell import inspect_response
from VATs.items import VatsItem
class MaharashtraSpider(scrapy.Spider):
name = "maharashtra"
allowed_domains = ["mahavat.gov.in"]
start_urls = (
'http://mahavat.gov.in/',
)
def __init__(self, **kwargs):
super(MaharashtraSpider, self).__init__(**kwargs)
self.base_no = 27020000034 - 97
def parse(self, response):
yield Request('http://mahavat.gov.in/Tin_Search/Tinsearch.jsp',
callback=self.parse_form, dont_filter=True)
def parse_form(self, response):
self.base_no += 97
yield FormRequest.from_response(
response,
formname='f1',
formdata={
"tin": "%sC" % self.base_no,
"pan": "",
"rc_no": "",
"fptecno": "",
"bptecno": "",
"DEALERNAME": "",
"Submit": "SEARCH"
},
callback=self.result_page)
def result_page(self, response):
try:
url = response.xpath(
'//a[@class="search-head"]/@href').extract()[0]
url = response.urljoin(url)
yield Request(url, callback=self.process)
except IndexError:
yield Request('http://mahavat.gov.in/', callback=self.parse, dont_filter=True)
def process(self, response):
x = response.xpath("//td/text()").extract()
x = [x[i].strip() for i in range(1, len(x), 2)]
if x[0] != '':
item = VatsItem()
item["Dealer_Name"] = x[0]
item["Tin_Number"] = x[1]
item["Effective_Canceled_Date"] = x[2]
item["Address1"] = x[3]
item["Street_Name"] = x[4]
item["Address2"] = x[5]
item["Address3"] = x[6]
item["Taluka_Name"] = x[7]
item["District_Name"] = x[8]
item["City_Name"] = x[9]
item["State_Name"] = x[10]
item["Pin_Code"] = x[11]
item["Old_RC_No"] = x[12]
item["Location_Name"] = x[13]
item["Act_Name"] = x[14]
yield item
yield Request('http://mahavat.gov.in/', callback=self.parse, dont_filter=True)
# inspect_response(response, self)
这个确实有效,但确实很慢!
有人可以帮助我并告诉我为什么第一个脚本不起作用吗?
如果有办法让第二个脚本更快?