I have a spider snippet in Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from infoseeker.items import InfoseekerItem as InfoItem


class SeekerSpider(CrawlSpider):
    name = 'seeker'
    allowed_domains = ['info.mzalendo.com']
    start_urls = ['http://info.mzalendo.com/position/member-national-assembly/?page=1']
    main_url = 'http://info.mzalendo.com/position/member-national-assembly/'
    urls = []
    retrieving = False

    def parse(self, response):
        if not self.retrieving:
            selector_list = response.css('.position')
            for selector in selector_list:
                self.urls.append(selector.css('a::attr(href)').extract()[0])

            found = response.css('.next::attr(href)').extract()
            if found:
                # pagination uses the ?page=2, ?page=3 format, so the href is
                # appended to main_url to avoid issues
                next_page = self.main_url + found[0]
            else:
                next_page = None

            if next_page is not None:
                yield response.follow(next_page, self.parse)
            else:
                self.retrieving = True
                # should run once all urls have been found
                for url in self.urls:
                    pass  # get content for url to be parsed
Since the content I want is spread across paginated listing pages, I first crawl all the pages, collect the detail-page URLs, and store them in self.urls. Once that pass is finished, I intend to start requesting those URLs and extract the useful information from each one. I'm not sure whether yield is the right thing to use for this.
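A side note on the pagination itself: response.follow resolves relative hrefs against response.url, so that step could also be written without the manual main_url join. A minimal equivalent sketch:

    # Equivalent pagination step: response.follow resolves a relative href
    # such as '?page=2' against response.url, so no manual join is needed.
    next_href = response.css('.next::attr(href)').get()
    if next_href:
        yield response.follow(next_href, callback=self.parse)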
Answer 0 (score: 0)
What you need is a callback: call another function to handle each url you want to scrape. From the Scrapy documentation for Request:

callback (callable) – the function that will be called with the response of this request (once it's downloaded) as its first parameter. For more information see Passing additional data to callback functions below. If a request doesn't specify a callback, the spider's parse() method will be used.

Here, it would look like this:
import .....


class SeekerSpider(CrawlSpider):
    name = 'seeker'
    allowed_domains = ['info.mzalendo.com']
    start_urls = ['http://info.mzalendo.com/position/member-national-assembly/?page=1']
    main_url = 'http://info.mzalendo.com/position/member-national-assembly/'
    urls = []
    retrieving = False

    def parse(self, response):
        .......
        # All the above code in between.
        .......

        # should run once all urls have been found
        for url in self.urls:
            # get the content for each url to be parsed -- use a callback
            yield scrapy.Request(url, callback=self.parse_url)

    def parse_url(self, response):
        # Do whatever you want with the response of the desired url.
        pass
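From there, parse_url could build the InfoseekerItem that the question imports as InfoItem. A minimal sketch of its body, with hypothetical field names and selectors (the real item definition isn't shown in the question):

    def parse_url(self, response):
        # Hypothetical extraction: 'name' and 'profile_url' are placeholder
        # fields -- adjust them to match the actual InfoseekerItem definition.
        item = InfoItem()
        item['name'] = response.css('h1::text').get()  # hypothetical selector
        item['profile_url'] = response.url
        yield item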