Web抓取(request -> jsdom -> cheerio)不适用于SPA(客户端脚本渲染的页面)

时间:2016-08-01 14:09:29

标签: node.js web-scraping cheerio jsdom

我正在尝试抓取一个网站,该网站包含在页面加载后才运行的脚本,但jsdom似乎没有执行这些脚本;而使用zombie.js(它内部也使用jsdom)却可以正常抓取。我不想使用zombie.js,因为我并不需要它附带的整套测试框架。

这是jsDom代码:

const request = require('request');
const jsdom = require('jsdom');
const cheerio = require('cheerio');
/** Origin of the site being scraped; request URLs are built on top of it. */
const domain = 'https://www.google.co.uk';
/** User-Agent string sent with the HTTP request and handed to jsdom. */
const usa = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36';

module.exports = function(cb) {
  const url = `${domain}/#q=monkeys`;
   request({ uri: url, headers: { 'User-Agent': usa } }, function (err, res, body) {
      if (err && res.statusCode !== 200) throw err;
      const window = jsdom.jsdom(body, { 
        url: url,
        userAgent: usa,
        features: {
          FetchExternalResources: ['script'],
          ProcessExternalResources: ['script']
        }
      }).defaultView;
      window.addEventListener('load', () => {
        // console.log(window.document.body.innerHTML)
        // This HTML has scripts.
        let $ = cheerio.load(window.document);
        let els = $('SELECTOR').map(function () {
          // Other code
        }).get();    
        cb(url, els);
        window.close();   
     });
  });
}

这是可以正常工作的zombie.js代码:

const Browser = require('zombie');
const cheerio = require('cheerio');
/** Origin of the site being scraped; the visited URL is built on top of it. */
const domain = 'https://www.google.co.uk';
/** User-Agent string the zombie browser identifies itself with. */
const usa = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36';

module.exports = function(cb) {
  const url = `${domain}/#q=monkeys`;
  const browser = new Browser({userAgent: usa});
  browser.visit(url, () => {
    browser.wait(window => window.document.getElementById('rhs'), function() {
      let $ = cheerio.load(browser.html());
      let els = $('SELECTOR').map(function () {
        // Other code
      }).get();
      cb(url, els)
      browser.window.close();
    });
  });
}

0 个答案:

没有答案