在phantomjs中使用策略模式

时间:2016-09-12 20:08:58

标签: javascript phantomjs

我正在尝试在我的爬虫中实现策略模式,因为我认为针对不同的网站使用不同的抓取策略是个不错的做法。因此,我希望page.evaluate内部执行的内容能够根据当前正在抓取的网站而变化。page.evaluate中被注释掉的代码是可以正常工作的,但有没有办法把它提取到一个函数中?我尝试过调用this.findJobs(),但没有成功。

"use strict";

var Crawler = function() {
    this.page = require('webpage').create();
    this.website = "";
    this.jobs_list = [];

};

Crawler.prototype.setStrategy = function(company) {
    this.website = company;
};

Crawler.prototype.findJobData = function() {
    return this.website.findJobData();
};

/**
 * Opens 'URL' in a PhantomJS page, injects jQuery, evaluates the job-finding
 * code in the page, logs the first job's title and then exits PhantomJS.
 */
Crawler.prototype.collectJobData = function() {
    // Creates a new page here instead of using this.page from the constructor.
    var page = require('webpage').create();
    // Relay console output produced inside the page to the PhantomJS console.
    page.onConsoleMessage = function(msg) { console.log(msg) };

    page.open('URL', function (status) {
        // jQuery is loaded into the page so the evaluated code can use `$`.
        page.includeJs("https://ajax.googleapis.com/ajax/libs/jquery/3.1.0/jquery.min.js", function() {
            // NOTE(review): `this` inside this callback is not the Crawler instance, and the
            // trailing () calls findJobs here instead of handing the function to page.evaluate.
            var temp_jobs = page.evaluate(this.findJobs());

                // Inline version of the scraping logic, left commented out:
                /*
                var jobs = [];
                var job;
                    $('ul.job-list').each(function(){
                    $(this).find('li').each(function(){
                        var job_link = $(this).find('a');
                        var url = "URL" + job_link.attr("href");
                        var location = $(this).find('span').text();

                        job = {title: job_link.text(), url: url, location: location, description: ""}
                        jobs.push(job);
                        console.log(job.title, job.url, job.location);
                    })
                });
                return jobs;*/
            // Assumes at least one job was returned.
            console.log(temp_jobs[0].title)

            // Terminate PhantomJS with exit code 0.
            phantom.exit(0);
        });
    });

};

/**
 * Scraping strategy for sites that list jobs as <li> items inside
 * <ul class="job-list"> elements.
 */
var strategy_a = function() {

    /**
     * Collects one record per <li> found under each ul.job-list.
     * Relies on a global `$` (jQuery) and uses no outer variables, so the
     * function can be handed to page.evaluate as-is.
     *
     * @returns {Array} objects of the form {title, url, location, description}.
     */
    this.findJobs = function() {
        var results = [];

        $('ul.job-list').each(function() {
            var items = $(this).find('li');

            items.each(function() {
                var item = $(this);
                var link = item.find('a');
                var href = link.attr("href");
                var place = item.find('span').text();
                var entry = {
                    title: link.text(),
                    url: "URL" + href,
                    location: place,
                    description: ""
                };

                results.push(entry);
                console.log(entry.title, entry.url, entry.location);
            });
        });

        return results;
    };
};


// Wire a concrete strategy into the crawler and start the crawl.
// The instance gets its own name so the strategy_a constructor is not
// overwritten and can still be used to create further instances.
var site_strategy = new strategy_a();
var crawler = new Crawler();

crawler.setStrategy(site_strategy);
crawler.collectJobData();

1 个答案:

答案 0 :(得分:1)

你有两个问题:

  • 您应该使用page.evaluate(this.findJobs);而不是page.evaluate(this.findJobs());——前者把函数本身传给page.evaluate,后者则是先调用该函数,再把它的返回值传进去。

  • page.includeJs回调内部的this不是对Crawler实例的引用。

这应该有效:

/**
 * Opens the target site in this crawler's page, injects jQuery and runs the
 * current strategy's findJobs function inside the page, then logs the title
 * of the first job found and exits PhantomJS.
 *
 * Exits with code 1 when the page cannot be loaded, and with code 0
 * otherwise (including when no jobs were found).
 */
Crawler.prototype.collectJobData = function() {
    var page = this.page;
    // The callbacks below are not invoked with the Crawler as `this`,
    // so keep an explicit reference to it.
    var self = this;
    page.onConsoleMessage = function(msg) { console.log(msg); };

    page.open('URL', function (status) {
        // Bail out early instead of evaluating against a page that never loaded.
        if (status !== 'success') {
            console.log('Unable to load page: ' + status);
            phantom.exit(1);
            return;
        }

        page.includeJs("https://ajax.googleapis.com/ajax/libs/jquery/3.1.0/jquery.min.js", function() {
            // Pass the function itself (no call parentheses): page.evaluate
            // runs it inside the page, where the injected `$` is available.
            var temp_jobs = page.evaluate(self.website.findJobs);

            // Guard against a null/empty result; an uncaught TypeError here
            // would skip phantom.exit and leave PhantomJS running forever.
            if (temp_jobs && temp_jobs.length > 0) {
                console.log(temp_jobs[0].title);
            } else {
                console.log('No jobs found');
            }

            phantom.exit(0);
        });
    });
};

请注意,您创建了多个页面对象却没有全部用到,因此我删除了第二个require('webpage').create()调用,改为直接使用this.page。