我正在学习使用 Scrapy 抓取 LinkedIn 上的职位信息。目前我认为已经可以用 Scrapy 登录,并到达包含职位链接信息的正确页面。但是,当我尝试用 XPath 选取职位链接时,返回的值不正确。有人能帮帮我吗?
这是我的代码:
import scrapy
from scrapy.http import Request, FormRequest
class LinkedinSpider2(scrapy.Spider):
    """Log in to LinkedIn, then log every anchor href found on a job-search page.

    Request flow: ``start_requests`` fetches the login page, ``login`` submits
    the login form, ``check_login_response`` verifies the login and requests
    the search page, and ``parse_item`` logs the links found there.
    """

    name = "linkedin2"
    allowed_domains = ['linkedin.com']
    login_page = 'https://www.linkedin.com/uas/login'
    start_url = 'http://www.linkedin.com/jobs/search/?keywords=data%20analyst&location=United%20States&locationId=us%3A0'

    def start_requests(self):
        """Start the crawl by requesting the login page instead of a start URL."""
        self.log("start_request")
        # dont_filter=True keeps the login request from being dropped by
        # Scrapy's request filtering.
        yield Request(url=self.login_page, callback=self.login, dont_filter=True)

    def login(self, response):
        """Build the login request from the form found on the login page.

        ``FormRequest.from_response`` pre-populates the form fields present in
        ``response``; only the two credential fields are overridden here.
        """
        # NOTE(review): the credentials below are hard-coded placeholders.
        return FormRequest.from_response(
            response,
            formdata={'session_key': '***@gmail.com', 'session_password': 'password'},
            callback=self.check_login_response,
        )

    def check_login_response(self, response):
        """Check whether the login succeeded and, if so, request the search page.

        Returns a ``Request`` for ``start_url`` when the marker text is found
        in the page, otherwise ``None`` (which ends the crawl).
        """
        # response.body is raw bytes; testing a str against it raises
        # TypeError on Python 3, so compare against the decoded text instead.
        if "My Network" in response.text:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            return Request(url=self.start_url, callback=self.parse_item)
        else:
            self.log("\n\n\nFailed, Bad times :(\n\n\n")

    def parse_item(self, response):
        """Log the response URL and, on the expected page, every ``<a href>`` value."""
        self.log(response.url)
        # Decoded text is used here for the same bytes-vs-str reason as in
        # check_login_response.
        if 'Cognius' in response.text:
            self.log('***right page***')
            # NOTE(review): //a/@href only sees anchors present in the
            # downloaded HTML. If the job list is built client-side by
            # JavaScript, those links will not appear here -- confirm against
            # the raw page source.
            self.log(response.xpath("//a/@href").extract())
        else:
            self.log('***wrong page***')
这是输出: enter image description here
以下是该页面的来源: enter image description here