NodeJS:http.request 返回了意料之外的响应

时间:2015-09-02 11:02:26

标签: javascript node.js http web-crawler

使用http.request时遇到了一些奇怪的现象。我的目的是用NodeJS写一个网络爬虫,获取并解析this webpage的数据。

http.request 返回的响应与Chrome中呈现的html不一致。

这是代码。

var https = require('https');
var fs = require('fs');

// Request options for the page to crawl.
var options = {
    // NOTE(review): this turns off TLS certificate verification. Acceptable
    // only for a throwaway experiment; remove it once the certificate validates.
    rejectUnauthorized: false,
    host: 'book.flypeach.com',
    path: '/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false',
};

/**
 * Accumulates the response body and writes it to craw.html once the
 * response has ended.
 *
 * Only the raw HTML sent by the server is saved; nothing here executes the
 * page's scripts, so content that the browser builds with JavaScript will
 * not be present in the output file.
 *
 * @param {Object} response - the response object passed by https.request.
 */
var callback = function(response) {
    var body = '';

    // Let the stream do the UTF-8 decoding. Converting each Buffer chunk to a
    // string on its own (what `body += chunk` does implicitly) corrupts a
    // multi-byte character whose bytes are split across two chunks.
    response.setEncoding('utf8');

    response.on('data', function(chunk) {
        body += chunk;
    });

    // Report a failure while the body is still being received.
    response.on('error', function(err) {
        console.log(err);
    });

    response.on('end', function() {
        fs.writeFile('craw.html' , body , function(err){
          if (err) return console.log(err);
        });
    });
};

// Without an 'error' listener, a DNS, TLS or socket failure is emitted on the
// request object and thrown as an uncaught exception.
https.request(options, callback)
    .on('error', function(err) {
        console.log(err);
    })
    .end();

我使用fs.writeFile来存储输出,但它与Chrome浏览器上的网页不同。

更新时间:2015/9/3

今天我尝试了phantomjs

这是我的新代码。但它也不起作用。

var system = require('system');
var page = require('webpage').create();
// Address of the page to load; it is served over HTTPS and all request
// parameters are carried in the query string.
var url = "https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false";

// Writes every resource request made by the page to stderr, with the request
// object pretty-printed as JSON (4-space indent).
page.onResourceRequested = function (requestData) {
    var errStream = system.stderr;
    errStream.writeLine('= onResourceRequested()');
    var dump = JSON.stringify(requestData, undefined, 4);
    errStream.writeLine('  request: ' + dump);
};

// Writes each received resource notification to stderr: its id, its stage,
// and the whole response object serialized as single-line JSON.
page.onResourceReceived = function (res) {
    var errStream = system.stderr;
    errStream.writeLine('= onResourceReceived()' );
    var prefix = '  id: ' + res.id + ', stage: "' + res.stage;
    errStream.writeLine(prefix + '", response: ' + JSON.stringify(res));
};

// Logs to stderr when a page load begins, together with the URL of the
// document that is being left (read from inside the page context).
page.onLoadStarted = function () {
    var errStream = system.stderr;
    errStream.writeLine('= onLoadStarted()');
    var leavingUrl = page.evaluate(function () {
        return window.location.href;
    });
    errStream.writeLine('  leaving url: ' + leavingUrl);
};

// Logs to stderr when a page load has finished, followed by the status value
// that was passed to the handler.
page.onLoadFinished = function (loadStatus) {
    var errStream = system.stderr;
    errStream.writeLine('= onLoadFinished()');
    errStream.writeLine('  status: ' + loadStatus);
};

// Logs each navigation request to stderr: destination URL, cause, whether the
// navigation will happen, and whether it comes from the main frame.
page.onNavigationRequested = function (destination, cause, willNavigate, fromMainFrame) {
    var report = [
        '= onNavigationRequested',
        '  destination_url: ' + destination,
        '  type (cause): ' + cause,
        '  will navigate: ' + willNavigate,
        '  from page\'s main frame: ' + fromMainFrame
    ];
    report.forEach(function (line) {
        system.stderr.writeLine(line);
    });
};

// Logs to stderr every resource that failed to load: its URL, the numeric
// error code and the textual error description.
page.onResourceError = function (failure) {
    var errStream = system.stderr;
    errStream.writeLine('= onResourceError()');
    errStream.writeLine('  - unable to load url: "' + failure.url + '"');
    var details = '  - error code: ' + failure.errorCode + ', description: ' + failure.errorString;
    errStream.writeLine(details);
};

// Logs JavaScript errors raised inside the page to stderr: the message,
// followed by one line per stack frame when a trace is available.
//   msg   - the error message.
//   trace - array of frames; each frame is read for `file`, `line` and
//           (optionally) `function`.
page.onError = function(msg, trace) {
    system.stderr.writeLine('= onError()');
    var msgStack = ['  ERROR: ' + msg];
    // An empty array is truthy, so also check the length; otherwise a
    // "TRACE:" header would be printed with no frames under it.
    if (trace && trace.length) {
        msgStack.push('  TRACE:');
        trace.forEach(function(t) {
            msgStack.push('    -> ' + t.file + ': ' + t.line + (t.function ? ' (in function "' + t.function + '")' : ''));
        });
    }
    system.stderr.writeLine(msgStack.join('\n'));
};

// Opens the page, prints the load status and, when the load succeeded, the
// document title; then terminates PhantomJS.
//
// NOTE(review): the recorded run fails with "SSL handshake failed" on
// PhantomJS 1.9.8. That is presumably an SSL protocol mismatch that is
// configured on the command line, not in this script, e.g.
//   phantomjs --ssl-protocol=any --ignore-ssl-errors=true script.js
// Confirm against the PhantomJS command-line documentation.
page.open(url, function(status) {
  var loaded = status === 'success';

  console.log(status);

  // Only touch the document when it actually loaded; after a failed load
  // there is nothing meaningful to evaluate.
  if (loaded) {
    var title = page.evaluate(function() {
      return document.title;
    });
    console.log(title);
  }

  // Exit on the next tick instead of from inside the callback. Exiting
  // synchronously here is a known trigger in PhantomJS 1.9.8 for the repeated
  // "Unsafe JavaScript attempt to access frame" messages. A non-zero exit code
  // lets calling scripts detect a failed load.
  setTimeout(function() {
    phantom.exit(loaded ? 0 : 1);
  }, 0);
});

我得到了详细的日志

= onNavigationRequested
  destination_url: https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false
  type (cause): Other
  will navigate: true
  from page's main frame: true
= onResourceRequested()
  request: {
    "headers": [
        {
            "name": "User-Agent",
            "value": "Mozilla/5.0 (Unknown; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) PhantomJS/1.9.8 Safari/534.34"
        },
        {
            "name": "Accept",
            "value": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        }
    ],
    "id": 1,
    "method": "GET",
    "time": "2015-09-03T08:42:29.674Z",
    "url": "https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false"
}
= onLoadStarted()
  leaving url: about:blank
= onResourceError()
  - unable to load url: "https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false"
  - error code: 6, description: SSL handshake failed
= onResourceReceived()
  id: 1, stage: "end", response: {"contentType":null,"headers":[],"id":1,"redirectURL":null,"stage":"end","status":null,"statusText":null,"time":"2015-09-03T08:42:29.845Z","url":"https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false"}
= onLoadFinished()
  status: fail
fail
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.

Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.

Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.

Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.

Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.

2 个答案:

答案 0 :(得分:1)

您需要使用一些渲染引擎,例如WebKit

尝试使用phantomjs

http://phantomjs.org/ https://github.com/sgentle/phantomjs-node 前者是可通过命令行使用的WebKit渲染器,后者是它的nodejs绑定模块。对大多数网站都非常好用。

答案 1 :(得分:0)

最明显的原因是您的爬虫在抓取网页时不会执行JavaScript,而该网站确实使用JavaScript来修改它的HTML。您可以通过在Chrome中按F12来显示开发者工具,然后使用右上角的“设置”按钮并单击“禁用JavaScript”复选框,来比较网站在使用JavaScript和不使用JavaScript时的外观。

网站还可以根据请求参数、请求头(例如User-Agent)等返回不同的HTML,但这可能不是这里的原因。