Is there a way to use Selenium together with LinkExtractor to handle JavaScript? I want to crawl a website and extract the PDF files linked from its pages, but many of the PDFs only become available after JavaScript has executed.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from selenium import webdriver
from scrapy.http import Request

class Fetcher(CrawlSpider):
    name = "Fetcher"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]

    rules = [Rule(LinkExtractor(allow=()), follow=True, callback='parse_item')]

    def __init__(self, *a, **kw):
        super(Fetcher, self).__init__(*a, **kw)
        self.driver = webdriver.PhantomJS()
        self.links = open("links.txt", "a")
        self.pdfs = open("pdfs.txt", "a")

    def parse_start_url(self, response):
        # Do stuff
        pass

    def parse_item(self, response):
        # Do stuff
        pass

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(Fetcher)
process.start()  # the script will block here until the crawling is finished
Answer 0 (score: 0)
I'd suggest using a DownloaderMiddleware to intercept these requests, download the page with Selenium, and return an HtmlResponse to the spider. Something like this:
from selenium import webdriver
from scrapy.http import HtmlResponse

class SeleniumMiddleware(object):

    def __init__(self):
        self.driver = webdriver.PhantomJS()  # Or whichever browser you want

    # Here you get the requests made to the urls your LinkExtractor found,
    # fetch them with selenium, and return the rendered page to the spider.
    def process_request(self, request, spider):
        self.driver.get(request.url)
        body = self.driver.page_source
        return HtmlResponse(self.driver.current_url, body=body,
                            encoding='utf-8', request=request)
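One thing this snippet leaves open is shutting the browser down when the crawl ends. A minimal sketch of one way to do that, assuming the middleware is built through Scrapy's from_crawler hook so it can listen for the spider_closed signal (the methods below would go inside the SeleniumMiddleware class above):

from scrapy import signals

    # inside SeleniumMiddleware, alongside __init__ and process_request:

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to construct the middleware; connecting to the
        # spider_closed signal lets us quit PhantomJS when the crawl ends.
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed,
                                signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()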
Remember to add the middleware to your settings:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}
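Back in the spider, the callback can then pull the PDF links out of the selenium-rendered HTML. Note that LinkExtractor skips .pdf URLs by default (pdf is in scrapy.linkextractors.IGNORED_EXTENSIONS), so the PDFs have to be collected in the callback rather than followed as links. A minimal sketch, assuming the PDFs show up as plain <a href="...pdf"> anchors and reusing the links/pdfs file handles from the spider's __init__:

    def parse_item(self, response):
        # response is the HtmlResponse the middleware built from the
        # selenium-rendered page, so javascript-inserted links are visible.
        self.links.write(response.url + "\n")
        for href in response.css('a::attr(href)').extract():
            if href.lower().endswith('.pdf'):
                self.pdfs.write(response.urljoin(href) + "\n")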