我试图从网站中提取一些数据,所以我在NodeJS中编写了一种网络爬虫。
问题是该页面正在返回HTML编码,如下所示:
</�◄♣b∟�♣66c���▬�inji�g��-��~\��r>
这是什么以及如何解决问题&#34;?
var http = require('follow-redirects').http;
// Utility function that downloads a URL and invokes
// callback with the data.
function download(url, callback) {
var request_options = {
host: url,
headers: {
'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
},
path: '/'
};
http.get(url, function (res) {
var data = "";
res.on('data', function (chunk) {
data += chunk;
});
res.on("end", function () {
callback(data);
});
}).on("error", function () {
callback(null);
});
}
var cheerio = require("cheerio");
var baseUrl = "http://www.mensagemaniversario.com.br";
var fs = require("fs");
var Entities = require('html-entities').AllHtmlEntities;
entities = new Entities();
var page = 1;
function recursiveDownload(url) {
download(url, function (data) {
if (data) {
// console.log(data);
var $ = cheerio.load(data);
console.log($.html());
$("div.box-list > p").each(function (i, e) {
var frase = $(e);
fs.appendFile("dados.xml", "<frase>" + entities.decode(frase.html()) + "</frase>");
});
console.log("Processado página " + url);
var linkNextPage = $("div.pagination a:last-child");
if (linkNextPage.length > 0) {
page++;
recursiveDownload(baseUrl + linkNextPage.attr("href"));
}
}
});
}
recursiveDownload(baseUrl + '/15-anos');
答案 0 :(得分:0)
旧帖子,但为了让其他人更容易寻找相同的答案,这是因为网页被压缩(例如压缩)。
以下帖子有对我有用的补救措施。