使用 PhantomJS 和 Node.js 进行网页抓取

时间:2014-02-18 13:25:25

标签: node.js phantomjs screen-scraping

我正在按照此处列出的教程进行操作:

http://code.tutsplus.com/tutorials/screen-scraping-with-nodejs--net-25560

当我运行代码时:

  // Script from the question: opens `host` through the `phantom` bridge, injects
  // jQuery, waits 5 seconds, then asks the page for the first element with class
  // "transition" and logs whatever comes back.
  var host = 'http://www.shoutcast.com/?action=sub&cat=Hindi#134';
  var phantom = require('phantom');
 phantom.create(function(ph) {
 return ph.createPage(function(page) {
 return page.open(host, function(status) {
  console.log("opened site? ", status);         

        // NOTE(review): injectJs is handed a remote URL here; the PhantomJS docs
        // appear to reserve injectJs for local script files and includeJs for
        // URLs -- confirm which one is intended.
        page.injectJs('http://ajax.googleapis.com/ajax/libs/jquery/1.11.0/jquery.min.js', function() {
            //jQuery Loaded.
            //Wait for a bit for AJAX content to load on the page. Here, we are waiting 5 seconds.
            setTimeout(function() {
                return page.evaluate(function() {

                    //Get what you want from the page using jQuery. A good way is to populate an object with all the jQuery commands that you need and then return the object.
                    console.log(document.getElementsByClassName('transition')[0]);

                    return document.getElementsByClassName('transition')[0];



                }, function(result) {
                    // Second argument to page.evaluate: logs the value handed back
                    // from the evaluated function, then calls ph.exit().
                    console.log(result);
                    ph.exit();
                });
            }, 5000);

        });
});
});
});

我收到以下错误:

phantom stdout: ReferenceError: Can't find variable: $


phantom stdout:   phantomjs://webpage.evaluate():7
phantomjs://webpage.evaluate():10
phantomjs://webpage.evaluate():10

我不知道这是什么意思,也找不到任何关于如何解决它的帮助……该怎么解决这个问题?

基本上,我想从正在抓取的网站中获取所有类名为 transition 的 'a' 标签。这些标签都是在网站上异步加载的。

1 个答案:

答案 0 :(得分:3)

这个与 $ 有关的错误是由 jQuery 引起的,可能存在冲突。仅仅为了抓取类名为 transition 的 'a' 标签,几乎没有必要注入 jQuery。您始终可以使用 document.querySelector 或 document.querySelectorAll。

var host = 'http://www.shoutcast.com/?action=sub&cat=Hindi#134';
var phantom = require('phantom');

// Opens `host` in PhantomJS, gives the page's AJAX content time to load, then
// collects every <a class="transition"> link and prints the result.
phantom.create(function(ph) {
    ph.createPage(function(page) {

        page.open(host, function(status) {

            console.log("opened site? ", status);
            if (status !== 'success') {
                // The page did not load, so there is nothing to scrape; shut
                // PhantomJS down instead of waiting for content that never comes.
                ph.exit();
                return;
            }

            //Wait for a bit for AJAX content to load on the page. Here, we are waiting 5 seconds.
            setTimeout(function() {

                page.evaluate(function() {
                    // This function runs inside the page, not in Node. DOM nodes
                    // cannot be passed back across that boundary, so copy the
                    // fields we care about into plain objects.
                    var links = document.querySelectorAll('a.transition');
                    return Array.prototype.map.call(links, function(link) {
                        return { href: link.href, text: link.textContent };
                    });
                },

                // Called in Node with the array returned from the page.
                function(result) {
                    console.log(result);
                    ph.exit();
                });

            }, 5000);

        });
    });
});

不过,我不太理解 function (result) { console.log(result); ...} 这种写法。我不确定 page.evaluate 是否接受回调函数作为第二个参数,请查阅文档确认。