我正在尝试使用 PhantomJS 从如下网站抓取数据:
// GET /scrape: starts PhantomJS, opens https://banc.nl/, injects jQuery and
// tries to log a price found on the page.
// NOTE: `res` is never used in this handler, so no HTTP response is sent from here.
app.get('/scrape', function(req, res) {
// Kept outside the promise chain so the .catch handler can reach them.
var _ph, _page;
phantom.create()
.then(function (ph) {
_ph = ph;
return ph.createPage();
})
.then(function (page) {
_page = page;
var url = "https://banc.nl/";
// The value returned here is what the next .then callback receives.
return page.open(url);
})
// NOTE: despite its name, the `page` parameter below is the resolved value of
// page.open(url) from the previous step (presumably a load status), not the
// page object stored in _page. Calling includeJs on it is what raises
// "TypeError: page.includeJs is not a function".
.then(function(page) {
page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js', function() {
page.evaluate(function() {
// NOTE(review): as written, this selector matches <kaMmif> elements nested
// inside .sc-dnqmqq; if both names are CSS classes of one element it would
// need to be '.sc-dnqmqq.kaMmif' - confirm against the page markup.
// filter() is used here only to run the callback per match; its result is discarded.
$('.sc-dnqmqq kaMmif').filter(function () {
var data = $(this);
// `price` is assigned without var/let/const, so it becomes an implicit global.
price = data.children().first().text();
// NOTE(review): presumably this logs inside the page context rather than the
// Node console - confirm where the output ends up.
console.log("Price: " + price);
});
});
});
})
.catch(function(err) {
console.log("Error: " , err);
// _page is only assigned once createPage() has resolved; if the failure
// happened earlier it is still undefined and this call throws.
_page.close();
//_ph.exit();
});
});
问题是我收到以下错误:Error: TypeError: page.includeJs is not a function
如果我取消对 `_ph.exit()` 的注释,还会收到如下警告:warn: exit() was called before waiting for commands to finish. Make sure you are not calling exit() prematurely.
我在SO中发现了几个关于此的问题,但没有解答我的问题。
答案 0 :(得分:1)
最新版本的 phantomjs-node 模块使用 Promise,因此我们可以用带有 `await` 运算符的 `async` 函数重写脚本,这样可读性更好:
const phantom = require('phantom');

const url = 'https://www.huobipro.com';
// Grace period that lets the page's own scripts finish rendering the prices.
const RENDER_DELAY_MS = 3000;

/**
 * Opens `url` in PhantomJS, injects jQuery into the page and logs the text of
 * every `span[price]` element.
 *
 * The PhantomJS instance is always shut down, even when a step fails, so no
 * orphaned phantomjs process is left behind. Errors are logged rather than
 * left as an unhandled promise rejection.
 */
(async function () {
  const instance = await phantom.create();
  try {
    const page = await instance.createPage();

    // Code passed to page.evaluate() runs inside the page, so its console
    // output and errors only become visible through these handlers.
    await page.on('onConsoleMessage', function (msg) {
      console.info(msg);
    });
    await page.on('onError', function (msg) {
      console.info(msg);
    });

    const status = await page.open(url);
    console.log('STATUS:', status);
    if (status !== 'success') {
      throw new Error('Failed to open ' + url + ' (status: ' + status + ')');
    }

    // Wait a bit for the page's JavaScript to load and run.
    await new Promise((resolve) => setTimeout(resolve, RENDER_DELAY_MS));

    await page.includeJs('https://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js');

    // This function is serialized and executed by PhantomJS's own JavaScript
    // engine, so it is deliberately kept to ES5 syntax.
    await page.evaluate(function () {
      $('span[price]').each(function () {
        console.log("Price: " + $(this).text());
      });
    });
  } catch (err) {
    console.error('Error:', err);
  } finally {
    // Runs only after every awaited command above has settled, which avoids
    // the "exit() was called before waiting for commands to finish" warning.
    await instance.exit();
  }
})();