我想先登录,然后从登录后才能访问的页面中提取数据。我的爬虫(spider)代码如下:
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.spiders import BaseSpider
from scrapy.http import FormRequest
from loginform import fill_login_form
class ElementSpider(scrapy.Spider):
    """Log in to GitHub, then print the e-mail addresses from the settings page.

    The crawl starts at the login page: ``parse`` submits the login form,
    ``after_login`` checks the outcome and requests the e-mail settings page,
    and ``parse_data`` extracts the addresses from that page.
    """

    name = 'example'
    start_urls = ['https://github.com/login']

    def parse(self, response):
        """Submit the login form found in the login page response.

        Returns a one-element list holding the form request; its response
        is handled by ``after_login``.
        """
        # from_response builds the request from the form contained in
        # ``response``; the fields given in ``formdata`` are set on top of it.
        return [FormRequest.from_response(
            response,
            formdata={'login': 'myid', 'password': 'my password'},
            callback=self.after_login)]

    def after_login(self, response):
        """Check the login result and, on success, request the e-mail page.

        Returns None when the failure message is present in the response
        body, otherwise a Request for the e-mail settings page whose
        response is handled by ``parse_data``.
        """
        # Imported locally so the ERROR level constant is always defined,
        # independent of the module-level imports.
        import logging

        # The body is compared as bytes so the check behaves the same on
        # Python 2 (str) and Python 3 (bytes).
        if b"Incorrect username or password" in response.body:
            print("hey")
            self.log("Login failed", level=logging.ERROR)
            return None

        return Request(url="https://github.com/settings/emails",
                       callback=self.parse_data)

    def parse_data(self, response):
        """Extract the e-mail address texts from the page and print them."""
        emails = response.xpath('//div[@class="boxed-group-inner"]/li[@class="clearfix css-truncate settings-email"]/span[@class="css-truncate-target"]/text()').extract()
        # Single-argument print() produces the same output on Python 2 and 3.
        print(emails)
输出中什么也没有。是我的代码在执行时出了什么错误吗?
答案 0 :(得分:0)
您尚未创建 ElementSpider 类的实例。
首先需要创建一个类的实例。
注意
每个类都应该有一个构造函数,因此建议您在类中实现 __init__ 方法。
修改后的代码如下:
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from scrapy.spiders import BaseSpider
from scrapy.http import FormRequest
from loginform import fill_login_form
class ElementSpider(scrapy.Spider):
    """Log in to GitHub, then print the e-mail entries from the settings page.

    The crawl starts at the login page: ``parse`` submits the login form,
    ``after_login`` checks the outcome and requests the e-mail settings page,
    and ``parse_data`` extracts the matching ``<span>`` elements from it.
    """

    name = 'example'
    start_urls = ['https://github.com/login']

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to the base spider constructor."""
        super(ElementSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        """Submit the login form found in the login page response.

        Returns a one-element list holding the form request; its response
        is handled by ``after_login``.
        """
        # from_response builds the request from the form contained in
        # ``response``; the fields given in ``formdata`` are set on top of it.
        return [FormRequest.from_response(
            response,
            formdata={'login': 'myid', 'password': 'my password'},
            callback=self.after_login)]

    def after_login(self, response):
        """Check the login result and, on success, request the e-mail page.

        Returns None when the failure message is present in the response
        body, otherwise a Request for the e-mail settings page whose
        response is handled by ``parse_data``.
        """
        # Imported locally so the ERROR level constant is always defined,
        # independent of the module-level imports.
        import logging

        # The body is compared as bytes so the check behaves the same on
        # Python 2 (str) and Python 3 (bytes).
        if b"Incorrect username or password" in response.body:
            print("hey")
            self.log("Login failed", level=logging.ERROR)
            return None

        return Request(url="https://github.com/settings/emails",
                       callback=self.parse_data)

    def parse_data(self, response):
        """Extract the matching ``<span>`` elements from the page and print them.

        The XPath has no ``/text()`` step, so the extracted values are the
        serialized elements rather than only their text.
        """
        emails = response.xpath('//*[@id="settings-emails"]/li/span[@class="css-truncate-target"]').extract()
        # Single-argument print() produces the same output on Python 2 and 3.
        print(emails)
if __name__ == "__main__":
    # Creating a spider instance by itself does not start a crawl. Scrapy's
    # CrawlerProcess instantiates the spider class and drives it, so the
    # file can be run directly as a script.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(ElementSpider)
    process.start()  # blocks until the crawl has finished
答案 1 :(得分:0)
您的实现有误,因为 github.com/settings/emails 页面中没有 "boxed-group-inner" 这个类。
要进行更正,请将response.xpath更改为:
response.xpath('//ul[@id="settings-emails"]/li[@class="Box-row clearfix css-truncate settings-email"]/span[@class="css-truncate-target"]/text()').extract()
在这种情况下,您也可以直接使用 HTML 树的末端(叶子)节点:
response.xpath('//span[@class="css-truncate-target"]/text()').extract()