如何修复 x-ray(Node.js 抓取库)响应中的编码问题?

时间:2015-11-13 10:09:37

标签: node.js web-scraping x-ray

以下脚本在我的 Node.js 服务器上运行良好,但当我抓取某些西里尔文网站时,偶尔会返回下面这种编码错误的响应。

脚本

// Query `url` through `x` (presumably an x-ray instance, per the question's tags)
// with a map of result keys to selectors, then regroup the results by field
// inside the callback.
x(url, {
    name: 'title',
    ogDescription: 'meta[property="og:description"]@content',
    metaDescription: 'meta[name="description"]@content',
        ogImage: 'meta[property="og:image"]@content',
        // NOTE(review): this selector looks malformed (nested `name="` and doubled
        // quotes); the intent is probably meta[name="twitter:image:src"] - confirm.
        twitterImage: 'meta[name="name="twitter:image:src""]@content',
    metaImage: 'meta[name="image"]@content',
    headImage: 'head img@src',
    contentImage_1: '.content img@src',
    contentImage_2: '.image img@src'
  })
(function (err, obj) {
    // NOTE(review): `err` is never checked; `obj` is read unconditionally below.
    // Each field collects its candidate values in an array.
    var firstData = {
        name: [
            obj.name
        ],
        description: [
            obj.metaDescription, 
            obj.ogDescription,
        ],
        image: [
            obj.ogImage,
            obj.twitterImage,
            obj.metaImage,
            obj.headImage,
            obj.contentImage_1,
            obj.contentImage_2
        ]
    }
    // NOTE: the snippet is cut off here as posted; the callback is never closed.

编码错误的回复示例

firstData { name: [ '(Rock, Pop) [15LP] [24/96] Queen - Studio Collection - 2015, 
                     FLAC (tracks) :: RuTracker.org' ],
  description:
   [ 'RuTracker.org » ���������� ��� (����������� ���������) » 
                      ������� ������� (Rock, Pop) [15LP] [24/96] Queen - 
                      Studio Collection - 2015, FLAC (tracks)',
                      undefined ],
  image: [ undefined, undefined, undefined, undefined, undefined, undefined ] }

我该如何解决这个问题?

1 个答案:

答案 0 :(得分:0)

您可以将 request 用作 x-ray 的驱动程序,并在其中用 iconv 对响应正文进行转码:

// Custom x-ray driver: fetch pages with `request` and transcode the response
// body from Windows-1251 to UTF-8 before handing it back to x-ray.
const { Iconv } = require('iconv');

// 'binary' makes `request` deliver the body as a one-byte-per-character string,
// so `conv` can recover the raw bytes losslessly.
const options = { encoding: 'binary' };

// NOTE(review): the source charset is hard-coded, so a page that is already
// UTF-8 will come out garbled; install this driver only for Windows-1251 sites.
const iconv = new Iconv('Windows-1251', 'utf8');

/**
 * Transcode a response body from Windows-1251 to a UTF-8 string.
 *
 * @param {string} body - Response body read with the 'binary' encoding.
 * @returns {string} The decoded text; falsy input is returned unchanged.
 * @throws {Error} Whatever `iconv.convert` throws if the conversion fails.
 */
function conv(body) {
    if (!body) return body;
    // Buffer.from is a static factory and must not be called with `new`.
    const raw = Buffer.from(body, 'binary');
    return iconv.convert(raw).toString();
}

const request = require('request').defaults(options);

/**
 * Driver function passed to `x.driver`.
 *
 * @param {object} context - Request context; only `context.url` is read.
 * @param {function(Error|null, string=)} callback - Receives the error, if any,
 *     and the (transcoded) response body.
 */
function driver(context, callback) {
    request(context.url, function (err, response, body) {
        if (err) return callback(err, body);
        let decoded;
        try {
            decoded = conv(body);
        } catch (convErr) {
            // Report a failed conversion through the callback rather than
            // throwing out of this asynchronous handler.
            return callback(convErr);
        }
        return callback(null, decoded);
    });
}
x.driver(driver);


// Scrape title, description and image candidates from `url` and log them
// grouped by field. Each value is a selector with an optional `@attribute`
// suffix; per the sample output in the question, a field with no match is
// left undefined.
x(url, {
    name: 'title',
    ogDescription: 'meta[property="og:description"]@content',
    metaDescription: 'meta[name="description"]@content',
    ogImage: 'meta[property="og:image"]@content',
    // Was 'meta[name="name="twitter:image:src""]', which is not a valid
    // attribute selector.
    twitterImage: 'meta[name="twitter:image:src"]@content',
    metaImage: 'meta[name="image"]@content',
    headImage: 'head img@src',
    contentImage_1: '.content img@src',
    contentImage_2: '.image img@src'
})(function (err, obj) {
    // Without this guard a failed request leaves `obj` undefined and the
    // property reads below throw a TypeError.
    if (err) {
        console.error(err);
        return;
    }

    // Collect the candidates for each field, in order of preference.
    const firstData = {
        name: [
            obj.name
        ],
        description: [
            obj.metaDescription,
            obj.ogDescription
        ],
        image: [
            obj.ogImage,
            obj.twitterImage,
            obj.metaImage,
            obj.headImage,
            obj.contentImage_1,
            obj.contentImage_2
        ]
    };
    console.log(firstData);
});