I have a spider script that I found here: http://planzero.org/blog/2013/03/07/spidering_the_web_with_casperjs
Basically, what I want to do is use RethinkDB to store every crawled link in a table. So I created another script called args.js, which takes one argument, the link, and stores it in the database:
// node args.js -link http://somesite.com
var r = require('rethinkdb');

var args = process.argv.slice(2);
var link = args[1];            // args[0] is the "-link" flag, args[1] is the URL
var connection = null;

var myDB = r.db("links").table("href");

r.connect({host: 'localhost', port: 28015}, function(err, conn) {
    if (err) throw err;
    connection = conn;
    conn.addListener('error', function(e) {
        processNetworkError(e);   // assumed to be defined elsewhere
    });
    conn.addListener('close', function() {
        cleanup();                // assumed to be defined elsewhere
    });
    insertLink(conn);
});

function insertLink(conn) {
    myDB.insert({
        "link": link
    }).run(conn, function(err, result) {
        if (err) throw err;
        // Exit only after the insert has completed, otherwise the process
        // can die before the write reaches the server.
        process.exit();
    });
}
That works just fine.
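For reference, here is a minimal read-back sketch (assuming the same links/href database and table, and RethinkDB running on localhost) to confirm that rows are actually landing in the table:

// check.js -- hypothetical helper, not part of the original scripts
var r = require('rethinkdb');

r.connect({host: 'localhost', port: 28015}, function(err, conn) {
    if (err) throw err;
    r.db('links').table('href').run(conn, function(err, cursor) {
        if (err) throw err;
        cursor.toArray(function(err, rows) {
            if (err) throw err;
            console.log(rows);   // every link stored so far
            conn.close();
        });
    });
});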
Now, my idea was that I could use child_process from my spider script to call this file and insert the data that way (since I can't call RethinkDB directly from the casperjs script):
// Set the start URL
var startUrl = 'http://google.com/';
// URL variables
var visitedUrls = [], pendingUrls = [];
// Create instances
var casper = require('casper').create({ /*verbose: true, logLevel: 'debug'*/ });
var utils = require('utils')
var helpers = require('./helpers')
var exec = require('child_process').exec,
child;
// Spider from the given URL
function spider(url) {
child = exec('node args.js -link ' + url,
function (error, stdout, stderr) {
console.log(stdout);
if (error !== null) {
console.log('exec error: ' + error);
}
});
// Add the URL to the visited stack
visitedUrls.push(url);
// Open the URL
casper.open(url).then(function() {
// Set the status style based on server status code
var status = this.status().currentHTTPStatus;
switch(status) {
case 200: var statusStyle = { fg: 'green', bold: true }; break;
case 404: var statusStyle = { fg: 'red', bold: true }; break;
default: var statusStyle = { fg: 'magenta', bold: true }; break;
}
// Display the spidered URL and status
this.echo(this.colorizer.format(status, statusStyle) + ' ' + url);
// Find links present on this page
var links = this.evaluate(function() {
var links = [];
Array.prototype.forEach.call(__utils__.findAll('a'), function(e) {
links.push(e.getAttribute('href'));
});
return links;
});
// Add newly found URLs to the stack
var baseUrl = this.getGlobal('location').origin;
Array.prototype.forEach.call(links, function(link) {
var newUrl = helpers.absoluteUri(baseUrl, link);
if (pendingUrls.indexOf(newUrl) == -1 && visitedUrls.indexOf(newUrl) == -1) {
//casper.echo(casper.colorizer.format('-> Pushed ' + newUrl + ' onto the stack', { fg: 'magenta' }));
pendingUrls.push(newUrl);
}
});
// If there are URLs to be processed
if (pendingUrls.length > 0) {
var nextUrl = pendingUrls.shift();
//this.echo(this.colorizer.format('<- Popped ' + nextUrl + ' from the stack', { fg: 'blue' }));
spider(nextUrl);
}
});
}
// Start spidering
casper.start(startUrl, function() {
spider(startUrl);
});
// Start the run
casper.run();
This doesn't work. The spider part runs fine, but my script never seems to get called. I created another file called test.js that basically just calls args.js directly:
var exec = require('child_process').exec,
child;
var url = "http://somesite.com"
child = exec('node args.js -link ' + url,
function (error, stdout, stderr) {
console.log(stdout);
if (error !== null) {
console.log('exec error: ' + error);
}
});
And that works just fine...
Edit: I wrapped the exec() code in a function and put a console.log() inside it. The console.log does get printed, so my function is being called, but the node invocation never seems to happen.
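One way to narrow this down is to check, from inside the CasperJS script itself, what the child_process module actually provides in that runtime (CasperJS runs on top of PhantomJS, not Node), for example with a quick probe like this sketch:

// Sketch: probe the PhantomJS/CasperJS child_process module from the spider script.
// Assumption: under PhantomJS, Node's exec may simply not exist, while spawn/execFile do.
var cp = require('child_process');
console.log('typeof cp.exec:     ' + typeof cp.exec);
console.log('typeof cp.execFile: ' + typeof cp.execFile);
console.log('typeof cp.spawn:    ' + typeof cp.spawn);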
Answer 0 (score: 1)
I figured out the problem. Here is my spider.js:
// Set the start URL
var startUrl = 'http://google.com/';
// URL variables
var visitedUrls = [], pendingUrls = [];
// Create instances
var casper = require('casper').create({ /*verbose: true, logLevel: 'debug'*/ });
var utils = require('utils')
var helpers = require('./helpers')
var cp = require('child_process');
function addUrl(myUrl) {
var params = ['args.js','--link', myUrl];
cp.execFile('node',params,{},function(_,stdout,stderr){
console.log(stdout);
});
}
// Spider from the given URL
function spider(url) {
addUrl(url);
// Add the URL to the visited stack
visitedUrls.push(url);
// Open the URL
casper.open(url).then(function() {
// Set the status style based on server status code
var status = this.status().currentHTTPStatus;
switch(status) {
case 200: var statusStyle = { fg: 'green', bold: true }; break;
case 404: var statusStyle = { fg: 'red', bold: true }; break;
default: var statusStyle = { fg: 'magenta', bold: true }; break;
}
// Display the spidered URL and status
this.echo(this.colorizer.format(status, statusStyle) + ' ' + url);
// Find links present on this page
var links = this.evaluate(function() {
var links = [];
Array.prototype.forEach.call(__utils__.findAll('a'), function(e) {
links.push(e.getAttribute('href'));
});
return links;
});
// Add newly found URLs to the stack
var baseUrl = this.getGlobal('location').origin;
Array.prototype.forEach.call(links, function(link) {
var newUrl = helpers.absoluteUri(baseUrl, link);
if (pendingUrls.indexOf(newUrl) == -1 && visitedUrls.indexOf(newUrl) == -1) {
//casper.echo(casper.colorizer.format('-> Pushed ' + newUrl + ' onto the stack', { fg: 'magenta' }));
pendingUrls.push(newUrl);
}
});
// If there are URLs to be processed
if (pendingUrls.length > 0) {
var nextUrl = pendingUrls.shift();
//this.echo(this.colorizer.format('<- Popped ' + nextUrl + ' from the stack', { fg: 'blue' }));
spider(nextUrl);
}
});
}
// Start spidering
casper.start(startUrl, function() {
spider(startUrl);
});
// Start the run
casper.run();
I added a function addUrl(myUrl) that uses the execFile child process to call node directly, passing the arguments as an array in the form ['file-name', 'argument prefix', 'argument'], matching what args.js expects.
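As a small follow-up (just a sketch built on the same execFile call, not something the original answer does), logging stderr and the error argument from the callback makes failures in args.js, such as RethinkDB not being reachable, easier to spot:

// Sketch: same addUrl, but surfacing errors and stderr from args.js
function addUrl(myUrl) {
    var params = ['args.js', '--link', myUrl];
    cp.execFile('node', params, {}, function(err, stdout, stderr) {
        if (stdout) console.log('args.js: ' + stdout);
        if (stderr) console.log('args.js stderr: ' + stderr);
        if (err)    console.log('args.js error: ' + err);
    });
}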