使用CasperJS抓取时将数据插入数据库

时间:2015-06-30 01:13:20

标签: javascript node.js casperjs rethinkdb

我有一个我在这里找到的蜘蛛脚本:http://planzero.org/blog/2013/03/07/spidering_the_web_with_casperjs

基本上,我想做的是在抓取的同时使用RethinkDB把所有链接存入一张表中。因此我另外写了一个名为args.js的脚本,它接收一个参数(链接),并将其存入数据库。

args.js

 // node args.js -link http://somesite.com
    // args.js setup: read the link from the command line, connect to the
    // local RethinkDB server and hand the open connection to insertLink().
    var r = require('rethinkdb');
    var args = process.argv.slice(2);
    // args[0] is the flag ("-link"), args[1] is the URL to store.
    var link = args[1];
    // Script-level handle to the open connection; set once connect() succeeds.
    var connection = null;

    // Fail fast with a usage hint instead of trying to insert a row without a link.
    if (!link) {
        console.error('usage: node args.js -link <url>');
        process.exit(1);
    }

    // Target table: database "links", table "href". Assigned before connecting
    // so it is ready by the time the connect callback runs.
    var myDB = r.db("links").table("href");

    r.connect( {host: 'localhost', port: 28015}, function(err, conn) {
        if (err) throw err;
        connection = conn;

        conn.addListener('error', function(e) {
            // processNetworkError is not defined in this script; call it only
            // when some other code provides it, otherwise report and exit
            // rather than dying with a ReferenceError.
            if (typeof processNetworkError === 'function') {
                processNetworkError(e);
            } else {
                console.error('rethinkdb connection error: ' + e);
                process.exit(1);
            }
        });

        conn.addListener('close', function() {
            // Same guard for cleanup(), which is not defined here either.
            if (typeof cleanup === 'function') {
                cleanup();
            } else {
                connection = null;
            }
        });

        insertLink(conn);
    });

    /**
     * Insert the link given on the command line into the target table and
     * terminate the process once the server has answered.
     *
     * @param {Object} [conn] - open RethinkDB connection; when omitted the
     *     script-level `connection` is used, as before.
     */
    function insertLink(conn) {
      myDB.insert({
        "link" : link
      }).run(conn || connection, function(err, result) {
        // Exit only from the callback: exiting right after run() can end the
        // process before the write has been sent, and hides any failure.
        var failure = err || (result && result.errors ? result.first_error : null);
        if (failure) {
          console.error('insert failed: ' + failure);
          process.exit(1);
        } else {
          process.exit();
        }
      });
    }

它运作得很好。

现在,我的想法是在蜘蛛脚本中通过child_process调用这个文件来插入数据(因为我无法直接从casperjs脚本调用rethinkdb),像这样:

spider.js

// Set the start URL (the first page handed to casper.start() and spider())
var startUrl = 'http://google.com/';

// URL variables
// visitedUrls: every URL that has already been passed to spider()
// pendingUrls: discovered URLs waiting to be crawled (push() to add, shift() to take)
var visitedUrls = [], pendingUrls = [];

// Create instances
var casper = require('casper').create({ /*verbose: true, logLevel: 'debug'*/ });
// NOTE(review): `utils` is not referenced anywhere in the code shown here
var utils = require('utils')
var helpers = require('./helpers')

// exec is used by spider() to launch `node args.js`; `child` holds what exec returns
var exec = require('child_process').exec,
    child;

/**
 * Spider from the given URL.
 *
 * Launches `node args.js -link <url>` through exec, records the URL as
 * visited, opens it with casper, prints its HTTP status, collects the href of
 * every anchor on the page, queues the ones not seen before and then calls
 * itself with the next pending URL.
 *
 * @param {string} url - URL to record and crawl.
 */
function spider(url) {

  // Run the Node helper that stores the link; its stdout and any error are logged.
  // NOTE(review): the URL is concatenated unquoted into a command string, so a
  // URL containing shell metacharacters would be interpreted by a shell if one
  // is involved - confirm, or pass it as a separate argument.
  // NOTE(review): this script runs under CasperJS, not Node; confirm that its
  // child_process module actually provides a working exec().
  child = exec('node args.js -link ' + url,
    function (error, stdout, stderr) {
      console.log(stdout);
      if (error !== null) {
        console.log('exec error: ' + error);
      }
  });

    // Add the URL to the visited list so it is never queued again
    visitedUrls.push(url);

    // Open the URL; the callback runs as a casper step with `this` used as the casper instance
    casper.open(url).then(function() {

        // Set the status style based on server status code:
        // green for 200, red for 404, magenta for anything else
        var status = this.status().currentHTTPStatus;
        switch(status) {
            case 200: var statusStyle = { fg: 'green', bold: true }; break;
            case 404: var statusStyle = { fg: 'red', bold: true }; break;
             default: var statusStyle = { fg: 'magenta', bold: true }; break;
        }

        // Display the spidered URL and status
        this.echo(this.colorizer.format(status, statusStyle) + ' ' + url);

        // Find links present on this page. The function passed to evaluate()
        // builds and returns the list of raw href attribute values.
        // NOTE(review): anchors without an href attribute yield null here;
        // verify that helpers.absoluteUri tolerates a null link.
        var links = this.evaluate(function() {
            var links = [];
            Array.prototype.forEach.call(__utils__.findAll('a'), function(e) {
                links.push(e.getAttribute('href'));
            });
            return links;
        });

        // Add newly found URLs to the pending list, skipping any URL that is
        // already pending or already visited
        var baseUrl = this.getGlobal('location').origin;
        Array.prototype.forEach.call(links, function(link) {
            var newUrl = helpers.absoluteUri(baseUrl, link);
            if (pendingUrls.indexOf(newUrl) == -1 && visitedUrls.indexOf(newUrl) == -1) {

                //casper.echo(casper.colorizer.format('-> Pushed ' + newUrl + ' onto the stack', { fg: 'magenta' }));
                pendingUrls.push(newUrl);
            }
        });

        // If there are URLs to be processed, take the oldest one (shift() removes
        // from the front, so the list is consumed first-in, first-out) and recurse
        if (pendingUrls.length > 0) {
            var nextUrl = pendingUrls.shift();
            //this.echo(this.colorizer.format('<- Popped ' + nextUrl + ' from the stack', { fg: 'blue' }));
            spider(nextUrl);
        }

    });

}

// Start spidering: casper.start() receives the start URL and a callback that
// passes the same URL to spider().
// NOTE(review): spider() calls casper.open(url) itself, so the start URL is
// presumably requested twice - confirm whether that is intended.
casper.start(startUrl, function() {
    spider(startUrl);
});

// Start the run (executes the steps registered above)
casper.run();

这不起作用。蜘蛛工作正常,但我的脚本似乎从未被调用过。我创建了另一个名为test.js的文件,它基本上只是直接调用args.js

test.js

// Minimal reproduction: launch args.js from plain Node for one fixed URL.
var childProcess = require('child_process');
var url = "http://somesite.com";
var command = 'node args.js -link ' + url;

// Print whatever the helper wrote to stdout, then the error if exec supplied one.
function reportResult(error, stdout, stderr) {
  console.log(stdout);
  if (error === null) {
    return;
  }
  console.log('exec error: ' + error);
}

var child = childProcess.exec(command, reportResult);

这很好用......

编辑:我把exec()代码放进了一个函数,并在其中加了一条console.log(),结果console.log有输出,说明我的函数确实被调用了。但对node的调用似乎没有执行。

1 个答案:

答案 0 :(得分:1)

我找到了问题所在。以下是我在spider.js中所做的修改:
// Entry point of the crawl
var startUrl = 'http://google.com/';

// Crawl bookkeeping: URLs already handed to spider(), and URLs still waiting
var visitedUrls = [];
var pendingUrls = [];

// CasperJS instance and supporting modules
var casper = require('casper').create({ /*verbose: true, logLevel: 'debug'*/ });
var utils = require('utils');
var helpers = require('./helpers');

// Used by addUrl() to launch the Node helper script
var cp = require('child_process');

/**
 * Store a URL by running `node args.js --link <url>` as a child process.
 *
 * The URL is passed as its own argv entry through execFile, so it is never
 * spliced into a command string. The helper's stdout is echoed; a launch
 * error or anything written to stderr is reported as well instead of being
 * dropped silently.
 *
 * @param {string} myUrl - URL to hand to args.js.
 */
function addUrl(myUrl) {
  var params = ['args.js', '--link', myUrl];
  cp.execFile('node', params, {}, function(err, stdout, stderr) {
    if (err) {
      console.log('execFile error: ' + err);
    }
    if (stderr) {
      console.log('args.js stderr: ' + stderr);
    }
    console.log(stdout);
  });
}

/**
 * Crawl one URL: store it through addUrl(), mark it visited, open it, print
 * its HTTP status, queue every link on the page that has not been seen yet,
 * and continue with the next pending URL.
 *
 * @param {string} url - URL to record and crawl.
 */
function spider(url) {
    // Persist the URL through the Node helper, then mark it as seen
    addUrl(url);
    visitedUrls.push(url);

    casper.open(url).then(function() {
        // Colour the status code: green for 200, red for 404, magenta otherwise
        var status = this.status().currentHTTPStatus;
        var colour = 'magenta';
        if (status === 200) {
            colour = 'green';
        } else if (status === 404) {
            colour = 'red';
        }
        var statusStyle = { fg: colour, bold: true };
        this.echo(this.colorizer.format(status, statusStyle) + ' ' + url);

        // Collect the raw href attribute of every anchor on the page
        var hrefs = this.evaluate(function() {
            return Array.prototype.map.call(__utils__.findAll('a'), function(anchor) {
                return anchor.getAttribute('href');
            });
        });

        // Queue each resolved URL that is neither pending nor already visited
        var origin = this.getGlobal('location').origin;
        Array.prototype.forEach.call(hrefs, function(href) {
            var candidate = helpers.absoluteUri(origin, href);
            var isKnown = pendingUrls.indexOf(candidate) !== -1 ||
                visitedUrls.indexOf(candidate) !== -1;
            if (!isKnown) {
                pendingUrls.push(candidate);
            }
        });

        // Continue with the oldest pending URL, if there is one
        if (pendingUrls.length === 0) {
            return;
        }
        spider(pendingUrls.shift());
    });
}

// Seed the crawl: the start step passes the start URL on to spider()
casper.start(startUrl, function beginCrawl() {
    spider(startUrl);
});

// Run the registered steps
casper.run();

我添加了一个函数addUrl(myUrl),它使用child_process的execFile直接调用node,并以数组形式传入参数:['文件名','参数前缀','参数值'],其中文件名即args.js。