Heyllo, everyone!
I'm building a scraper that uses node-simplecrawler. Everything works fine, but I can't figure out how to stop one instance when creating a new one (I want only one running at a time). I'm using Express, and all of the scraping logic lives in a single route. Right now, the only way to cancel a crawl immediately is to kill the Node process and restart the app.
Here's the part of the code that runs the crawler (note: I've simplified the code somewhat to keep it short):
module.exports = function(socket) {
var express = require('express');
var router = express.Router();
[... requires continue...]
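// The requires were trimmed above; based on usage further down, the code
// relies on at least these two modules (an assumption, not the original list):
var request = require('request');
var Crawler = require('simplecrawler');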
/* POST scanning page. */
router.post('/', function(req, res, next) {
res.render('scanning'); // Load the socket.io host page
var render = {};
var pages = [];
var timer = new Date();
// Helper func to log the requests.
function log(message) {
var now = new Date();
console.log(now - timer + 'ms', message);
timer = now;
}
// Ensure URL format, parse URL
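// (A minimal, hypothetical sketch of the elided step above — the real field
// name and parsing code were trimmed out; `req.body.url` is a guess:)
// var url = require('url').parse(req.body.url);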
// Check if the URL exists
request(url.href, function (error, response, body) {
if (!error && response.statusCode == 200) {
// URL exists, so let's scan it
// Exclude links to the following extensions:
var exclude = ['gif', 'jpg', 'jpeg', 'png', 'ico', 'bmp', 'ogg', 'webp',
'mp4', 'webm', 'mp3', 'ttf', 'woff', 'json', 'rss', 'atom', 'gz', 'zip',
'rar', '7z', 'css', 'js', 'gzip', 'exe', 'xml', 'svg'];
var exts = exclude.join('|');
var regexReject = new RegExp('\\.(' + exts + ')', 'i'); // '\\.' so the dot is matched literally
var rootURL = url.protocol + '//' + url.host + '/';
// Crawler configuration
var crawler = new Crawler(url.host);
crawler.initialPort = 80;
crawler.initialPath = url.pathname;
crawler.maxConcurrency = 1;
crawler.ignoreWWWDomain = false; // This is a little suspicious...
crawler.filterByDomain = true; // Only URLs from the current domain
crawler.scanSubdomains = true;
crawler.downloadUnsupported = false;
crawler.parseHTMLComments = false;
crawler.parseScriptTags = false;
crawler.acceptCookies = false;
// crawler.maxDepth = 1 // Debug only!
/*
* Fetch Conditions
*/
// Get only URLs, ignore feeds, only from this host
crawler.addFetchCondition(function (parsedURL) {
return (
!parsedURL.path.match(regexReject) && // Only links
(parsedURL.path.search('/feed') === -1) && // Ignore feeds
(parsedURL.host === url.host) // Page is from this domain
);
});
// Should we only include subpages?
if(onlySubpages) {
crawler.addFetchCondition(function(parsedURL) {
// return parsedURL.path.search(url.pathname) > -1;
return parsedURL.path.startsWith(url.pathname);
// console.log(url, parsedURL);
});
}
// Exclude URLs containing any of these fragments?
if(excludeUrls.length >= 1) {
crawler.addFetchCondition(function(parsedURL) {
var urlFragmentsOk = true;
var excludeUrlFragments = excludeUrls.replace(/\s/g, '').split(',');
excludeUrlFragments.forEach(function(fragment) {
if(parsedURL.path.search('/'+fragment) > -1) {
urlFragmentsOk = false;
}
});
return urlFragmentsOk;
});
}
// Include only URLs containing one of these fragments
if(includeOnlyUrls.length >= 1) {
crawler.addFetchCondition(function(parsedURL) {
var urlFragmentsOk = false;
var includeUrlFragments = includeOnlyUrls.replace(/\s/g, '').split(',');
includeUrlFragments.forEach(function(fragment) {
if(parsedURL.path.search('/'+fragment) !== -1) {
urlFragmentsOk = true;
}
});
return urlFragmentsOk;
});
}
// Run the crawler
crawler.start();
// Execute for each URL, on fetchcomplete
crawler.on('fetchcomplete', function(item, responseBuffer, response) {
// [Do stuff with the scraped page]
});
// Completed crawling. Now let's get to work!
crawler.on('complete', function() {
// [Get all scraped pages and do something with them]
});
// Error handling
crawler.on('queueerror', function(errorData, URLData) {
console.log('Queue error:', errorData, URLData);
});
crawler.on('fetchdataerror', function(queueitem, response) {
console.log('Fetch error:', queueitem, response);
});
crawler.on('fetchtimeout', function(queueItem, crawlerTimeoutValue) {
console.log('Fetch timeout:', queueItem, crawlerTimeoutValue);
});
crawler.on('fetchclienterror', function(queueItem, errorData) {
console.log('Fetch local error:', queueItem, errorData);
});
} else if(error) {
console.log(error);
}
});
});
return router;
}
Answer (score: 2)
Every simplecrawler instance has a stop method that can be called to prevent the crawler from making any further requests (requests already in flight won't be aborted, though).
I would probably store the crawler instance in a scope outside of the route handler, check whether it's defined as the very first thing in the route handler, and in that case call its stop method before constructing a new crawler.
I stripped out a lot of the inner workings of your code, but something like this is what I had in mind:
module.exports = function(socket) {
var express = require('express');
var router = express.Router();
var Crawler = require('simplecrawler');
var crawler;
router.post('/', function(req, res, next) {
// Check if the URL exists
request(url.href, function (error, response, body) {
if (!error && response.statusCode == 200) {
// Stop any crawler that's already running
if (crawler instanceof Crawler) {
crawler.stop();
}
// Crawler configuration
crawler = new Crawler(url.host);
crawler.initialPort = 80;
crawler.initialPath = url.pathname;
// Run the crawler
crawler.start();
// Execute for each URL, on fetchcomplete
crawler.on('fetchcomplete', function(item, responseBuffer, response) {
// [Do stuff with the scraped page]
});
// Completed crawling. Now let's get to work!
crawler.on('complete', function() {
// [Get all scraped pages and do something with them]
});
} else if(error) {
console.log(error);
}
});
});
return router;
}
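One extra note: recent 1.x releases of simplecrawler document an optional argument to stop that also aborts requests already in flight, which may matter if the cancellation needs to be immediate. Worth verifying against the version you have installed:
// Assumed simplecrawler 1.x API: also abort requests currently in flight
crawler.stop(true);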