NodeJS中的Web爬虫 - 加密页面等

时间:2014-06-28 17:54:09

标签: node.js web-crawler

我试图从网站中提取一些数据,所以我在NodeJS中编写了一种网络爬虫。

问题是该页面正在返回HTML编码,如下所示:

</�◄♣b∟�♣66c���▬�inji�g��-��~\��r>

这是什么以及如何解决问题&#34;?

更新

var http = require('follow-redirects').http;

// Utility function that downloads a URL and invokes
// callback with the data.
function download(url, callback) {

    var request_options = {
        host: url,
        headers: {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
        },
        path: '/'
    };

    http.get(url, function (res) {
        var data = "";
        res.on('data', function (chunk) {
            data += chunk;
        });
        res.on("end", function () {
            callback(data);
        });
    }).on("error", function () {
        callback(null);
    });
}

var cheerio = require("cheerio");

var baseUrl = "http://www.mensagemaniversario.com.br";

var fs = require("fs");

var Entities = require('html-entities').AllHtmlEntities;

entities = new Entities();

var page = 1;

function recursiveDownload(url) {
    download(url, function (data) {
        if (data) {
            // console.log(data);
            var $ = cheerio.load(data);
            console.log($.html());
            $("div.box-list > p").each(function (i, e) {
                var frase = $(e);


                fs.appendFile("dados.xml", "<frase>" + entities.decode(frase.html()) + "</frase>");
            });

            console.log("Processado página " + url);
            var linkNextPage = $("div.pagination a:last-child");
            if (linkNextPage.length > 0) {
                page++;
                recursiveDownload(baseUrl + linkNextPage.attr("href"));
            }
        }
    });
}

recursiveDownload(baseUrl + '/15-anos');

1 个答案:

答案 0 :(得分:0)

旧帖子,但为了让其他人更容易寻找相同的答案,这是因为网页被压缩(例如压缩)。

以下帖子有对我有用的补救措施。

VB.NET: How to retrieve a compressed web page to a string?