我正在尝试抓取一个网站,该网站包含在页面加载完成后才运行的脚本,但 jsdom 似乎不会执行这些脚本;而改用 zombie.js(其底层使用 jsdom)则可以正常抓取。我不想使用 zombie.js,因为我并不需要它附带的整套测试框架。
这是jsDom代码:
const request = require('request');
const jsdom = require('jsdom');
const cheerio = require('cheerio');
// Origin of the site to scrape; the request URL is built from it.
const domain = 'https://www.google.co.uk'
// User-Agent string sent with the HTTP request and also handed to jsdom.
const usa = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36';
module.exports = function(cb) {
const url = `${domain}/#q=monkeys`;
request({ uri: url, headers: { 'User-Agent': usa } }, function (err, res, body) {
if (err && res.statusCode !== 200) throw err;
const window = jsdom.jsdom(body, {
url: url,
userAgent: usa,
features: {
FetchExternalResources: ['script'],
ProcessExternalResources: ['script']
}
}).defaultView;
window.addEventListener('load', () => {
// console.log(window.document.body.innerHTML)
// This HTML has scripts.
let $ = cheerio.load(window.document);
let els = $('SELECTOR').map(function () {
// Other code
}).get();
cb(url, els);
window.close();
});
});
}
这是可以正常工作的 zombie.js 代码:
const Browser = require('zombie');
const cheerio = require('cheerio');
// Origin of the site to scrape; the visited URL is built from it.
const domain = 'https://www.google.co.uk'
// User-Agent string passed to the Zombie browser instance.
const usa = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36';
module.exports = function(cb) {
const url = `${domain}/#q=monkeys`;
const browser = new Browser({userAgent: usa});
browser.visit(url, () => {
browser.wait(window => window.document.getElementById('rhs'), function() {
let $ = cheerio.load(browser.html());
let els = $('SELECTOR').map(function () {
// Other code
}).get();
cb(url, els)
browser.window.close();
});
});
}