使用 http.request 时遇到了一些奇怪的现象。我的目的是用 Node.js 编写一个网络爬虫,抓取并解析这个网页(this webpage)的数据。
但 http.request 返回的内容与 Chrome 中呈现的 HTML 不一致。
代码如下:
var https = require('https');
var fs = require('fs');
// Request options for the fare-search page that is being crawled.
var options = {
  // NOTE(review): this disables TLS certificate verification, which leaves the
  // request open to man-in-the-middle attacks. Remove it once the server's
  // certificate chain validates.
  rejectUnauthorized: false,
  host: 'book.flypeach.com',
  path: '/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false',
};

/**
 * Collects the whole response body and writes it to craw.html.
 *
 * Only the raw body sent by the server is saved; no script in the page is
 * executed, so markup that the page builds in the browser will not be present.
 *
 * @param {http.IncomingMessage} response - Response stream passed by https.request.
 */
var callback = function(response) {
  // Decode through the stream's own UTF-8 decoder. Concatenating raw Buffer
  // chunks converts each chunk separately and corrupts any multi-byte
  // character that happens to be split across two chunks.
  response.setEncoding('utf8');

  var body = '';
  response.on('data', function(chunk) {
    body += chunk;
  });
  response.on('end', function() {
    fs.writeFile('craw.html', body, function(err) {
      if (err) return console.log(err);
    });
  });
};

var req = https.request(options, callback);
// Without an 'error' listener, a DNS, TLS or connection failure is emitted on
// the request and thrown as an uncaught exception.
req.on('error', function(err) {
  console.error(err);
});
req.end();
我用 fs.writeFile 保存了输出,但保存下来的内容与 Chrome 浏览器中显示的网页不同。
更新时间:2015/9/3
今天我尝试了 PhantomJS。
下面是我的新代码,但它同样无法正常工作。
var system = require('system');
var page = require('webpage').create();
var url = "https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false";
// Handler for page.onResourceRequested: writes a marker line to stderr,
// followed by the request object as JSON indented with four spaces.
page.onResourceRequested = function (requestData) {
  var out = system.stderr;
  out.writeLine('= onResourceRequested()');
  var dump = JSON.stringify(requestData, undefined, 4);
  out.writeLine(' request: ' + dump);
};
// Handler for page.onResourceReceived: writes a marker line to stderr, then
// one line with the response id, its stage and the whole response as JSON.
page.onResourceReceived = function(response) {
  var out = system.stderr;
  out.writeLine('= onResourceReceived()' );

  var idPart = ' id: ' + response.id;
  var stagePart = ', stage: "' + response.stage + '"';
  var jsonPart = ', response: ' + JSON.stringify(response);
  out.writeLine(idPart + stagePart + jsonPart);
};
// Handler for page.onLoadStarted: writes a marker line to stderr, then the
// value of window.location.href obtained through page.evaluate.
page.onLoadStarted = function() {
  system.stderr.writeLine('= onLoadStarted()');

  // Passed to page.evaluate; reads the address from the page's window object.
  var readLocation = function() {
    return window.location.href;
  };
  var previousUrl = page.evaluate(readLocation);

  system.stderr.writeLine(' leaving url: ' + previousUrl);
};
// Handler for page.onLoadFinished: writes a marker line and the status value
// it was called with to stderr.
page.onLoadFinished = function(status) {
  var out = system.stderr;
  out.writeLine('= onLoadFinished()');
  out.writeLine(' status: ' + status);
};
// Handler for page.onNavigationRequested: writes a marker line followed by
// each of its four arguments, one per line, to stderr.
page.onNavigationRequested = function(destinationUrl, cause, willNavigate, fromMainFrame) {
  var report = [
    '= onNavigationRequested',
    ' destination_url: ' + destinationUrl,
    ' type (cause): ' + cause,
    ' will navigate: ' + willNavigate,
    ' from page\'s main frame: ' + fromMainFrame
  ];
  for (var i = 0; i < report.length; i += 1) {
    system.stderr.writeLine(report[i]);
  }
};
// Handler for page.onResourceError: writes a marker line, the failing URL and
// the error code with its description to stderr.
page.onResourceError = function(failure) {
  var out = system.stderr;
  out.writeLine('= onResourceError()');

  var target = ' - unable to load url: "' + failure.url + '"';
  out.writeLine(target);

  var detail = ' - error code: ' + failure.errorCode + ', description: ' + failure.errorString;
  out.writeLine(detail);
};
// Handler for page.onError: writes a marker line to stderr, then the error
// message and, when a trace is supplied, one line per trace entry, all joined
// with newlines into a single writeLine call.
page.onError = function(msg, trace) {
  system.stderr.writeLine('= onError()');

  var report = [' ERROR: ' + msg];
  if (trace) {
    var frames = trace.map(function(frame) {
      var entry = ' -> ' + frame.file + ': ' + frame.line;
      // The function name is appended only when the entry carries one.
      if (frame.function) {
        return entry + ' (in function "' + frame.function + '")';
      }
      return entry;
    });
    report = report.concat([' TRACE:'], frames);
  }

  system.stderr.writeLine(report.join('\n'));
};
// Loads the page, prints the load status (and the document title when the
// load succeeded), then terminates PhantomJS.
//
// NOTE(review): an 'SSL handshake failed' resource error on PhantomJS 1.9.x is
// usually a protocol mismatch and cannot be fixed from this script; try
// running with `--ssl-protocol=any` (and `--ignore-ssl-errors=true` if the
// certificate is also rejected) - confirm against the target server.
page.open(url, function(status) {
  console.log(status);

  // Only read the title when a document was actually loaded; after a failed
  // load there is no page content to evaluate against.
  if (status === 'success') {
    var title = page.evaluate(function() {
      return document.title;
    });
    console.log(title);
  }

  // Exit on the next tick rather than inside the callback: exiting
  // synchronously here is a known trigger on PhantomJS 1.9.8 for the
  // "Unsafe JavaScript attempt to access frame with URL about:blank" warnings.
  setTimeout(function() {
    phantom.exit();
  }, 0);
});
我得到了如下的详细日志:
= onNavigationRequested
destination_url: https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false
type (cause): Other
will navigate: true
from page's main frame: true
= onResourceRequested()
request: {
"headers": [
{
"name": "User-Agent",
"value": "Mozilla/5.0 (Unknown; Linux x86_64) AppleWebKit/534.34 (KHTML, like Gecko) PhantomJS/1.9.8 Safari/534.34"
},
{
"name": "Accept",
"value": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
}
],
"id": 1,
"method": "GET",
"time": "2015-09-03T08:42:29.674Z",
"url": "https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false"
}
= onLoadStarted()
leaving url: about:blank
= onResourceError()
- unable to load url: "https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false"
- error code: 6, description: SSL handshake failed
= onResourceReceived()
id: 1, stage: "end", response: {"contentType":null,"headers":[],"id":1,"redirectURL":null,"stage":"end","status":null,"statusText":null,"time":"2015-09-03T08:42:29.845Z","url":"https://book.flypeach.com/default.aspx?ao=B2CZHTW&ori=KHH&des=KIX&dep=2015-09-10&ret=2015-09-17&adt=2&chd=0&inf=0&langculture=zh-TW&bLFF=false"}
= onLoadFinished()
status: fail
fail
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://server.js. Domains, protocols and ports must match.
答案 0 :(得分:1)
您需要使用某种渲染引擎,例如 WebKit。
可以尝试使用 PhantomJS:
http://phantomjs.org/ 是可以从命令行使用的 WebKit 渲染器,https://github.com/sgentle/phantomjs-node 是它的 Node.js 绑定模块。它在大多数网站上都工作得很好。
答案 1 :(得分:0)
最明显的原因是您的爬虫没有执行 JavaScript,而这个网站确实使用 JavaScript 来修改它的 HTML。您可以在 Chrome 中按 F12 打开开发者工具,点击右上角的“设置”按钮并勾选“禁用 JavaScript”复选框,从而比较网站在启用和禁用 JavaScript 时的外观。
网站还可以根据参数、请求头(例如 User-Agent)等返回不同的 HTML,但这里发生的情况可能并非如此。