Question

我正在尝试抓取下面的网址，并使用 PhantomJS 提取页面标签中的链接。该网址返回的是 JavaScript 代码，这些代码需要在浏览器中执行后才会在 body 内生成 HTML。问题是 PhantomJS 似乎没有执行这些 JavaScript，运行代码后我得到的只是不完整的 HTML。

Index.js

/*
 * PhantomJS script: load a page, give its own scripts time to build the DOM,
 * then print the <body> markup to stdout.
 *
 * Written in ES5 on purpose: PhantomJS does not understand ES2015 syntax.
 */
var page = require('webpage').create();

// How long to wait after the load callback before reading the DOM.
// This is a heuristic settle time, not a guarantee; raise it (or poll for a
// specific element) if the target page renders more slowly.
var RENDER_DELAY_MS = 5000;

console.log('The default user agent is ' + page.settings.userAgent);
page.settings.userAgent = 'SpecialAgent';

// Report errors thrown by the page's own scripts so that a failing script
// is visible instead of just producing incomplete HTML.
page.onError = function(msg) {
  console.log('Page script error: ' + msg);
};

page.open('http://videohost.site/play/A11QStEaNdVZfvV/', function(status) {
  if (status !== 'success') {
    console.log('Unable to access network');
    // Non-zero exit code so callers can tell the load failed.
    phantom.exit(1);
    return;
  }

  // The load callback can fire before scripts that build the body
  // asynchronously have finished, so defer both the read and the exit.
  setTimeout(function() {
    var html = page.evaluate(function() {
      return document.getElementsByTagName('body')[0].innerHTML;
    });
    console.log(html);
    phantom.exit();
  }, RENDER_DELAY_MS);
});

我在控制台中使用phantomjs index.js执行它来检查HTML。关于如何执行内部脚本的任何帮助将非常感激。

更新：根据 Vaviloff 的建议，我改用 NightmareJS 进行了尝试。

/**
 * GET /scrape — loads the target page in Nightmare, reads the rendered
 * <body> markup, and logs it to the server console.
 *
 * The HTTP response ('Woohoo') is sent immediately and does not wait for,
 * or reflect the outcome of, the scrape; results and failures are only logged.
 *
 * NOTE(review): `.end()` shuts down the Nightmare instance it is called on.
 * If `nightmare` is a single module-level instance, a second request to this
 * route will presumably fail — confirm, and create an instance per request
 * if so.
 */
app.get('/scrape', function (req, res) {
  // All the web scraping magic will happen here
  console.log('Scrape');

  // Declared locally so it does not leak as an implicit global shared
  // across requests (an undeclared assignment also throws in strict mode).
  var url = 'http://videohost.site/play/A11QStEaNdVZfvV/';

  nightmare
    .goto(url)
    .evaluate(function () {
      // Runs inside the page context; the returned string is passed back to Node.
      return document.body.innerHTML; // pass all of the html as text
    })
    .end()
    .then(function (html) {
      console.log(html);
    })
    .catch(function (error) {
      console.error('Search failed:', error);
    });

  res.send('Woohoo');
});
// Start the HTTP server on port 8081 and print the start-up banner.
// The banner is logged right after listen() is called, without waiting
// for the port to be bound.
app.listen('8081');
console.log('Magic happens on port 8081');

// Expose the Express app as this module's export and keep the local
// `exports` alias pointing at the same object.
module.exports = app;
exports = module.exports;

PhantomJS在标签中执行javascript

0 个答案: