Stopping a node-simplecrawler instance when creating a new one (making it behave like a singleton)

Asked: 2016-03-07 12:52:02

Tags: node.js express

Heyllo, everyone!

I'm building a scraper that uses node-simplecrawler. Everything works fine, but I can't figure out how to stop a running instance when a new one is created (I want only one instance running at a time). I'm using express, and all the scraping logic lives in a single route. At the moment, the only way to cancel a crawl immediately is to stop the node process and run the app again.

Here is the part of the code that runs the scraper (note: I've simplified the code a bit so it's shorter):

module.exports = function(socket) {
  var express = require('express');
  var router = express.Router();

  // [... requires continue...]

  /* POST scanning page. */
  router.post('/', function(req, res, next) {
    res.render('scanning'); // Load the socket.io host page
    var render = {};
    var pages = [];
    var timer = new Date();
    // Helper func to log the requests.
    function log(message) {
      var now = new Date();
      console.log(now - timer + 'ms', message);
      timer = now;
    }

    // Ensure URL format, parse URL

    // Check if the URL exists
    request(url.href, function (error, response, body) {
      if (!error && response.statusCode == 200) {
        // URL exists, so let's scan it
        // Exclude links to the following extensions:
        var exclude = ['gif', 'jpg', 'jpeg', 'png', 'ico', 'bmp', 'ogg', 'webp',
          'mp4', 'webm', 'mp3', 'ttf', 'woff', 'json', 'rss', 'atom', 'gz', 'zip',
          'rar', '7z', 'css', 'js', 'gzip', 'exe', 'xml', 'svg'];
        var exts = exclude.join('|');
        var regexReject = new RegExp('\\.(' + exts + ')', 'i');
        var rootURL = url.protocol + '//' + url.host + '/';

        // Crawler configuration
        var crawler = new Crawler(url.host);
        crawler.initialPort = 80;
        crawler.initialPath = url.pathname;
        crawler.maxConcurrency = 1;
        crawler.ignoreWWWDomain = false; // This is a little suspicious...
        crawler.filterByDomain = true; // Only URLs from the current domain
        crawler.scanSubdomains = true;
        crawler.downloadUnsupported = false;
        crawler.parseHTMLComments = false;
        crawler.parseScriptTags = false;
        crawler.acceptCookies = false;
        // crawler.maxDepth = 1 // Debug only!

        /*
         * Fetch Conditions
         */

        // Get only URLs, ignore feeds, only from this host
        crawler.addFetchCondition(function (parsedURL) {
          return (
            !parsedURL.path.match(regexReject) && // Only links
            (parsedURL.path.search('/feed') === -1) && // Ignore feeds
            (parsedURL.host === url.host) // Page is from this domain
          );
        });

        // Should we only include subpages?
        if(onlySubpages) {
          crawler.addFetchCondition(function(parsedURL) {
            // return parsedURL.path.search(url.pathname) > -1;
            return parsedURL.path.startsWith(url.pathname);
            // console.log(url, parsedURL);
          });
        }

        // Exclude URLs containing certain fragments?
        if(excludeUrls.length >= 1) {
          crawler.addFetchCondition(function(parsedURL) {
            var urlFragmentsOk = true;
            var excludeUrlFragments = excludeUrls.replace(/\s/g, '').split(',');

            excludeUrlFragments.forEach(function(fragment) {
              if(parsedURL.path.search('/'+fragment) > -1) {
                urlFragmentsOk = false;
              }
            });

            return urlFragmentsOk;
          });
        }

        // Include only URLs with fragments
        if(includeOnlyUrls.length >= 1) {
          crawler.addFetchCondition(function(parsedURL) {
            var urlFragmentsOk = false;
            var includeUrlFragments = includeOnlyUrls.replace(/\s/g, '').split(',');

            includeUrlFragments.forEach(function(fragment) {
              if(parsedURL.path.search('/'+fragment) !== -1) {
                urlFragmentsOk = true;
              }
            });
            return urlFragmentsOk;
          });
        }

        // Run the crawler
        crawler.start();

        // Execute for each URL, on fetchcomplete
        crawler.on('fetchcomplete', function(item, responseBuffer, response) {
          // [Do stuff with the scraped page]
        });

        // Completed crawling. Now let's get to work!
        crawler.on('complete', function() {
          // [Get all scraped pages and do something with them]
        });

        // Error handling
        crawler.on('queueerror', function(errorData, URLData) {
          console.log('Queue error:', errorData, URLData);
        });

        crawler.on('fetchdataerror', function(queueitem, response) {
          console.log('Fetch error:', queueitem, response);
        });

        crawler.on('fetchtimeout', function(queueItem, crawlerTimeoutValue) {
          console.log('Fetch timeout:', queueItem, crawlerTimeoutValue);
        });

        crawler.on('fetchclienterror', function(queueItem, errorData) {
          console.log('Fetch local error:', queueItem, errorData);
        });

        crawler.on('fetchtimeout', function(queueItem, crawlerTimeoutValue) {
          console.log('Crawler timeout:', queueItem, crawlerTimeoutValue);
        });

      } else if(error) {
        console.log(error);
      }
    });
  });
  return router;
}

1 Answer:

Answer (score: 2):

Every simplecrawler instance has a stop method that can be called to keep the crawler from making any further requests (requests that are already in flight will not be aborted, though).
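For illustration, here is a minimal sketch of that call on its own (assuming the same simplecrawler API the code in the question already uses; 'example.com' is just a placeholder host):

var Crawler = require('simplecrawler');

var crawler = new Crawler('example.com');
crawler.start();

// Later, e.g. when a new scan is requested:
crawler.stop(); // no further queue items are fetched; requests already in flight still complete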

I would probably keep the crawler instance in a scope outside the route handler, check whether it is already defined as the first thing inside the route handler, call its stop method if it is, and only then construct a new crawler.

I stripped out a lot of the meat of your code, but something like this is what I had in mind:

module.exports = function(socket) {
  var express = require('express');
  var router = express.Router();
  var Crawler = require('simplecrawler');

  var crawler;

  router.post('/', function(req, res, next) {
    // Check if the URL exists
    request(url.href, function (error, response, body) {
      if (!error && response.statusCode == 200) {

        // Stop any crawler that's already running
        if (crawler instanceof Crawler) {
            crawler.stop();
        }

        // Crawler configuration
        crawler = new Crawler(url.host);
        crawler.initialPort = 80;
        crawler.initialPath = url.pathname;

        // Run the crawler
        crawler.start();

        // Execute for each URL, on fetchcomplete
        crawler.on('fetchcomplete', function(item, responseBuffer, response) {
            // [Do stuff with the scraped page]
        });

        // Completed crawling. Now let's get to work!
        crawler.on('complete', function() {
            // [Get all scraped pages and do something with them]
        });

      } else if(error) {
        console.log(error);
      }
    });
  });

  return router;
}