I am trying to crawl the site www.example.com with the code below.

My goal is to capture every URL, along with the request headers, response body, response headers and cookies. This should happen for every URL found within the domain of the site I am crawling.
import scrapy
from scrapy.http import FormRequest, Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.response import open_in_browser

from myproject.items import TesterItem  # "myproject" is a placeholder for the project package


class RSpider(CrawlSpider):
    name = 'thursday-example'
    allowed_domains = ['localhost']
    # start_urls = ['/dashboard']
    login_url = 'http://www.example.com/authorize'

    # Follow every in-domain link except the logout endpoints.
    rules = [Rule(LinkExtractor(allow=(), deny=('/auth/logout*', '/logout*')),
                  callback='parse_me', follow=True)]

    def start_requests(self):
        print(">>> Spider {} started".format(self.name))
        yield Request(url=self.login_url, callback=self.login_example)

    def login_example(self, response):
        # Pull the CSRF token out of the login form and post the credentials.
        token = response.xpath('//*[@name="user_token"]/@value').extract_first()
        print('>>> Token : {}'.format(token))
        data = {
            'user_token': token,
            'username': 'admin',
            'password': 'password'
        }
        print('>>> Logging in')
        return [FormRequest.from_response(response, formdata=data, callback=self.after_login)]

    def after_login(self, response):
        open_in_browser(response)
        # response.body is bytes, so compare against a bytes literal.
        if b'Login failed' not in response.body:
            print('>>> Login successful!')
        else:
            print('>>> Login failed!')
        yield Request(url="http://www.example.com/dashboard", callback=self.parse, dont_filter=True)

    def parse_me(self, response):
        print('>>> parsing {}'.format(response.url))
        # Re-queue every link found on the page, then export the current response.
        for link in LinkExtractor().extract_links(response):
            yield Request(link.url, callback=self.parse_me)
        yield self.exporter(response)

    def exporter(self, response):
        # Field names use underscores so they can be declared on the Item class.
        item = TesterItem()
        item['url'] = response.url
        item['req_body'] = response.request.body
        item['res_body'] = response.body
        item['req_headers'] = response.request.headers
        item['res_headers'] = response.headers
        item['res_set_cookies'] = response.headers.getlist('Set-Cookie')
        return item

    def closed(self, reason):
        print('>>> Done')
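
TesterItem is assumed to be a plain scrapy.Item that declares the fields filled in exporter(); a minimal sketch (my real items.py may differ):

import scrapy

class TesterItem(scrapy.Item):
    url = scrapy.Field()
    req_body = scrapy.Field()
    res_body = scrapy.Field()
    req_headers = scrapy.Field()
    res_headers = scrapy.Field()
    res_set_cookies = scrapy.Field()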
I think I am still missing some URLs. Am I using the custom parse callback correctly, or is there a better way to achieve this? I need to crawl the entire site (there will be duplicates, but I am handling those with the dupefilter debugger).
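
By "dupefilter debugger" I mean Scrapy's built-in duplicate-filter logging, enabled in settings.py roughly like this (minimal sketch, other settings omitted):

# settings.py - log every request dropped as a duplicate, not just the first,
# so I can see exactly which URLs the dupefilter is skipping.
DUPEFILTER_DEBUG = True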