Is there a way to use Selenium together with LinkExtractor to handle JavaScript? I want to crawl a website and extract the PDF files linked from its pages, but many of the PDFs only become available after JavaScript has executed.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from selenium import webdriver
from scrapy.http import Request

class Fetcher(CrawlSpider):
    name = "Fetcher"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]

    rules = [Rule(LinkExtractor(allow=()), follow=True, callback='parse_item')]

    def __init__(self, *a, **kw):
        super(Fetcher, self).__init__(*a, **kw)
        self.driver = webdriver.PhantomJS()
        self.links = open("links.txt", "a")
        self.pdfs = open("pdfs.txt", "a")

    def parse_start_url(self, response):
        # Do stuff
        pass

    def parse_item(self, response):
        # Do stuff
        pass

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(Fetcher)
process.start()  # the script will block here until the crawling is finished
Answer 0 (score: 0)
I'd suggest using a DownloaderMiddleware to intercept these requests, download the page with Selenium, and return an HtmlResponse to the spider. Something like this:
from selenium import webdriver
from scrapy.http import HtmlResponse

class SeleniumMiddleware(object):

    def __init__(self):
        self.driver = webdriver.PhantomJS()  # Or whichever browser you want

    # Here you get the requests made to the urls your LinkExtractor found,
    # fetch them with selenium, and return the rendered page to the spider.
    def process_request(self, request, spider):
        self.driver.get(request.url)
        body = self.driver.page_source
        return HtmlResponse(self.driver.current_url, body=body,
                            encoding='utf-8', request=request)
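One thing this snippet leaves open is shutting the browser down when the crawl ends. A minimal sketch of one way to do that, assuming the middleware is built through Scrapy's from_crawler hook so it can listen for the spider_closed signal (the methods below would go inside the SeleniumMiddleware class above):

from scrapy import signals

    # inside SeleniumMiddleware, alongside __init__ and process_request:

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to construct the middleware; connecting to the
        # spider_closed signal lets us quit PhantomJS when the crawl ends.
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed,
                                signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()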
Remember to add the middleware to your settings:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}
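Back in the spider, the callback can then pull the PDF links out of the selenium-rendered HTML. Note that LinkExtractor skips .pdf URLs by default (pdf is in scrapy.linkextractors.IGNORED_EXTENSIONS), so the PDFs have to be collected in the callback rather than followed as links. A minimal sketch, assuming the PDFs show up as plain <a href="...pdf"> anchors and reusing the links/pdfs file handles from the spider's __init__:

    def parse_item(self, response):
        # response is the HtmlResponse the middleware built from the
        # selenium-rendered page, so javascript-inserted links are visible.
        self.links.write(response.url + "\n")
        for href in response.css('a::attr(href)').extract():
            if href.lower().endswith('.pdf'):
                self.pdfs.write(response.urljoin(href) + "\n")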