我正在尝试抓取需要登录、且数据由JavaScript动态加载的页面。目前我可以使用Scrapy成功登录,但我的爬虫看不到我需要的数据,因为这些数据是由JavaScript加载的。
我做了一些搜索,发现Selenium可能是一个可行的解决方案。我想使用Selenium启动浏览器并查看页面,看来我应该使用Selenium WebDriver工具,但我不知道该怎么做。有谁知道我应该在哪里以及如何向我的爬虫添加Selenium代码?
非常感谢。
#My spider looks like
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request, FormRequest
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import selenium
import time
from login.items import SummaryItem
class titleSpider(BaseSpider):
name = "titleSpider"
allowed_domains = ["domain.com"]
start_urls = ["https://www.domain.com/login"]
# Authentication
def parse(self, response):
return [FormRequest.from_response(response,
formdata={'session_key': 'myusername', 'session_password': 'mypassword'},
callback=self.after_login)]
# Request the webpage
def after_login(self, response):
# check login succeed before going on
if "Error" in response.body:
print "Login failed"
else:
print "Login successfully"
return Request(url="https://www.domain.com/result1",
callback=self.parse_page) # this page has some data loaded using javascript
def __init__(self):
CrawlSpider.__init__(self)
self.verificationErrors = []
# How can I know selenium passes authentication?
self.selenium = selenium("localhost", 4444, "*firefox", "https://www.domain.com/result1")
print "Starting the Selenium Server!"
self.selenium.start()
print "Successfully, Started the Selenium Server!"
def __del__(self):
self.selenium.stop()
print self.verificationErrors
CrawlSpider.__del__(self)
# Parse the page
def parse_page(self, response):
item = SummaryItem()
hxs = HtmlXPathSelector(response)
item['name']=hxs.select('//span[@class="name"]/text()').extract() # my spider cannot see the name.
# Should I add selenium codes here? Can it load the page that requires authentication?
sel= self.selenium
sel.open(response.url)
time.sleep(4)
item['name']=sel.select('//span[@class="name"]/text()').extract() #
return item
答案 0(得分:0)
您可以尝试这样的事情
def __init__(self):
    """Start a WebDriver-controlled Firefox when the spider is created."""
    BaseSpider.__init__(self)
    # webdriver.Firefox() launches a real browser, so the page's
    # JavaScript runs and dynamically loaded content becomes visible.
    # NOTE(review): `webdriver` must be imported (from selenium import
    # webdriver) — the snippet does not show that import.
    self.selenium = webdriver.Firefox()
def __del__(self):
    """Shut the browser down when the spider object is destroyed."""
    self.selenium.quit()
    # NOTE(review): verificationErrors is never assigned in this snippet;
    # presumably it is set in the full spider — confirm before use.
    print self.verificationErrors
def parse(self, response):
    """Open the fetched URL in the Selenium-driven browser and give
    the page's JavaScript time to run."""
    # Navigate the already-running browser to the page Scrapy requested.
    self.selenium.get(response.url)
    # Fixed pause so dynamically loaded content has a chance to appear.
    sleep(3)