[Edit: the code has been revised to show my attempt at Granitosaurus' suggestion]
I need to run some predefined searches and crawl the results one level deep, then parse those pages. This has to be done in sequence, because the site won't serve the result pages unless they were retrieved through the search.
My predefined searches are in a list, and I'm trying to iterate over that list, but I can't get it to work. Scrapy either jumps around or runs through all of start_requests without ever parsing. Can anyone show me how to do this? Thanks!
The search terms are just pairs of date ranges, which I list at the top.
Here is my [newly edited] code, with my non-working loop commented out:
import scrapy

monthlist = [{'from_date': '9/1/13', 'to_date': '9/30/13'},
             {'from_date': '10/1/13', 'to_date': '10/31/13'},
             {'from_date': '11/1/13', 'to_date': '11/30/13'},
             {'from_date': '12/1/13', 'to_date': '12/31/13'},
             {'from_date': '1/1/14', 'to_date': '1/31/14'},
             {'from_date': '2/1/14', 'to_date': '2/28/14'},
             {'from_date': '3/1/14', 'to_date': '3/31/14'},
             {'from_date': '4/1/14', 'to_date': '4/30/14'},
             {'from_date': '5/1/14', 'to_date': '5/31/14'},
             {'from_date': '6/1/14', 'to_date': '6/30/14'},
             {'from_date': '7/1/14', 'to_date': '7/31/14'},
             {'from_date': '8/1/14', 'to_date': '8/31/14'},
             {'from_date': '9/1/14', 'to_date': '9/30/14'}]
# remainder of list snipped for this post


class docketSpider(scrapy.Spider):
    name = 'robey'
    custom_settings = {
        'DEPTH_PRIORITY': 1,
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue'
    }
    start_urls = ['http://justice1.dentoncounty.com/PublicAccessDC/default.aspx']
    monkeycount = 0

    def start_requests(self):
        for i in range(33):
            self.monkeycount =+ 1
            yield scrapy.Request('http://justice1.dentoncounty.com/PublicAccessDC/default.aspx', dont_filter=True)
            yield scrapy.Request('http://justice1.dentoncounty.com/PublicAccessDC/default.aspx', self.parse, dont_filter=True)
            # yield scrapy.Request('http://justice1.dentoncounty.com/PublicAccessDC/Search.aspx?ID=200&NodeID=1256&NodeDesc=393rd%20Judicial%20District%20Court', dont_filter=True)

    def parse(self, response):
        request = scrapy.Request('http://justice1.dentoncounty.com/PublicAccessDC/Search.aspx?ID=200&NodeID=1256&NodeDesc=393rd%20Judicial%20District%20Court',
                                 callback=self.retrieve_caselist)
        return request

    def retrieve_caselist(self, response):
        # caserange = {'from_date': '9/1/2013', 'to_date': '9/30/2013'}
        # for caserange in monthlist:
        yield scrapy.FormRequest.from_response(
            response,
            url='http://justice1.dentoncounty.com/PublicAccessDC/Search.aspx?ID=200&NodeID=1256&NodeDesc=393rd%20Judicial%20District%20Court',
            formdata={'CaseSearchMode': 'CaseNumber', 'SearchBy': '6', 'ExactName': 'on',
                      'PartySearchMode': 'Name', 'AttorneySearchMode': 'Name', 'cboState': 'AA',
                      'CaseStatusType': '0', 'SortBy': 'fileddate', 'SearchSubmit': 'Search',
                      'SearchType': 'CASE', 'StatusType': 'true', 'AllStatusTypes': 'true',
                      'SearchParams': 'DateFiled~~Search By:~~6~~Date Filed||chkExactName~~Exact Name:~~on~~on||AllOption~~Case Status:~~0~~All||DateFiledOnAfter~~Date Filed On or After:~~' + monthlist[self.monkeycount]['from_date'] + '~~' + monthlist[self.monkeycount]['from_date'] + '||DateFiledOnBefore~~Date Filed On or Before:~~' + monthlist[self.monkeycount]['to_date'] + '~~' + monthlist[self.monkeycount]['to_date'] + '||selectSortBy~~Sort By:~~Filed Date~~Filed Date',
                      'SearchMode': 'FILED',
                      'DateFiledOnAfter': monthlist[self.monkeycount]['from_date'],
                      'DateFiledOnBefore': monthlist[self.monkeycount]['to_date']},
            callback=self.parse_caselist,
            dont_filter=True
        )

    def parse_caselist(self, response):
        for href in response.xpath('//html/body/table/tr/td/a/@href').extract():
            full_url = response.urljoin(href)
            yield scrapy.Request(full_url, callback=self.parse_casedetail)

    def parse_casedetail(self, response):
        yield {
            'case_num': response.xpath('/html/body/div[2]/span/text()').extract(),
            'attorneys': response.xpath('/html/body/table[4]/tr/td/b/text()').extract(),
            'file_date': response.xpath('/html/body/table[3]/tr/td[3]/table/tr/td/table/tr[2]/td/b/text()').extract(),
            'case_type': response.xpath('/html/body/table[3]/tr/td[3]/table/tr/td/table/tr[1]/td/b/text()').extract()
        }
When it runs, it goes through the loop issuing every request, but it never calls self.parse on each pass (nor any of the callables that run after self.parse in turn). After the loop, it parses one set of search results and then acts as if it is finished. Here is a pastebin log of it running from PyCharm. Any ideas?
Answer 0 (score: 0)
It looks like you're being limited by the website's session. You could try starting several web sessions in start_requests() and then issuing a unique search query for each session.
Something like this:
import scrapy
from scrapy import Request, FormRequest


class MySpider(scrapy.Spider):
    name = "myspider"
    start_url = "http://scrapy.org"
    search_options = [
        {'name': 'john', 'last_name': 'snow'},
        {'name': 'james', 'last_name': 'bond'},
    ]

    def start_requests(self):
        # Start a separate web session (cookiejar) for every search option
        for i, option in enumerate(self.search_options):
            yield Request(self.start_url,
                          dont_filter=True,
                          meta={'search_option': option,
                                'cookiejar': i})

    def parse(self, response):
        # make the search request with this session's specific options!
        yield FormRequest.from_response(response,
                                        callback=self.parse_search,
                                        meta=response.meta,
                                        formdata=response.meta['search_option'])

    def parse_search(self, response):
        # parse your search here
        pass
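To stay inside the same session when following the result links, each follow-up request has to carry the same 'cookiejar' meta key (handled by Scrapy's built-in CookiesMiddleware, which is enabled by default). Below is a minimal sketch, not part of the original answer, of how parse_search could be filled in; it assumes, purely for illustration, that the result links are plain anchor elements and that a hypothetical parse_detail callback scrapes each detail page:

    def parse_search(self, response):
        # Follow each result link, keeping the same cookiejar so the site
        # still sees the session that performed this particular search.
        for href in response.xpath('//a/@href').extract():
            yield Request(response.urljoin(href),
                          callback=self.parse_detail,
                          meta={'cookiejar': response.meta['cookiejar'],
                                'search_option': response.meta['search_option']})

    def parse_detail(self, response):
        # Illustrative only: record which search produced this page and
        # pull whatever fields you actually need from the detail page.
        yield {
            'search_option': response.meta['search_option'],
            'title': response.xpath('//title/text()').extract_first(),
        }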