PhantomJS在localhost上返回状态200,但在线上服务器上返回403

时间:2016-08-10 00:44:50

标签: phantomjs http-status-code-403 akamai

我需要从给定的URL抓取HTML文档。在我的localhost上,PhantomJS脚本能够正常返回该URL的页面内容;但在线上服务器上,我得到的却是403 Forbidden状态。

scraper.js

// scraper.js — PhantomJS script that loads the URL passed as the first
// command-line argument and prints the loaded page's HTML to stdout.
//
// Usage: phantomjs scraper.js <url>
var system = require('system');
var page = require('webpage').create();

// The target URL is the first argument after the script name.
// Declared with `var` so it does not leak as an implicit global.
var url = system.args[1];

if (!url) {
    // No URL supplied: explain the expected invocation and signal failure.
    system.stderr.writeLine('Usage: phantomjs scraper.js <url>');
    phantom.exit(1);
} else {
    page.open(url, function (status) {
        if (status === 'success') {
            console.log(page.content);
            phantom.exit(0);
        } else {
            // Report the failure instead of exiting silently with code 0,
            // so callers can tell an empty page from a failed load.
            system.stderr.writeLine('Failed to load ' + url + ' (status: ' + status + ')');
            phantom.exit(1);
        }
    });
}

PhantomJS命令:

phantomjs scraper.js http://www.submarino.com.br/produto/126862765/

该抓取脚本在其他页面上工作正常,但对域名www.submarino.com.br和www.americanas.com.br不起作用。我知道这与Akamai有关。出错时的响应输出如下:

Response (#1, stage "start"): {"body":"","bodySize":300,"contentType":"text/html","headers":[{"name":"Server","value":"AkamaiGHost"},{"name":"Mime-Version","value":"1.0"},{"name":"Content-Type","value":"text/html"},{"name":"Content-Length","value":"300"},{"name":"Expires","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Connection","value":"close"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"start","status":403,"statusText":"Forbidden","time":"2016-08-10T00:38:13.540Z","url":"http://www.submarino.com.br/produto/126862765/"}
Response (#1, stage "end"): {"contentType":"text/html","headers":[{"name":"Server","value":"AkamaiGHost"},{"name":"Mime-Version","value":"1.0"},{"name":"Content-Type","value":"text/html"},{"name":"Content-Length","value":"300"},{"name":"Expires","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:38:13 GMT"},{"name":"Connection","value":"close"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"end","status":403,"statusText":"Forbidden","time":"2016-08-10T00:38:13.541Z","url":"http://www.submarino.com.br/produto/126862765/"}

当它正常工作时返回:

Response (#1, stage "start"): {"body":"","bodySize":30076,"contentType":"text/html;charset=UTF-8","headers":[{"name":"Content-Encoding","value":"gzip"},{"name":"Content-Type","value":"text/html;charset=UTF-8"},{"name":"Server","value":"Apache-Coyote/1.1"},{"name":"X-Powered-By","value":"JSF/1.2"},{"name":"x-tid","value":"CATALOGO-0d4d336f-c0f1-4b71-9663-28fa89b5c123"},{"name":"Cache-Control","value":"max-age=1800"},{"name":"Expires","value":"Wed, 10 Aug 2016 01:10:18 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:40:18 GMT"},{"name":"Connection","value":"keep-alive"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"start","status":200,"statusText":"OK","time":"2016-08-10T00:40:18.388Z","url":"http://www.submarino.com.br/produto/126862765/"}
Response (#1, stage "end"): {"contentType":"text/html;charset=UTF-8","headers":[{"name":"Content-Encoding","value":"gzip"},{"name":"Content-Type","value":"text/html;charset=UTF-8"},{"name":"Server","value":"Apache-Coyote/1.1"},{"name":"X-Powered-By","value":"JSF/1.2"},{"name":"x-tid","value":"CATALOGO-0d4d336f-c0f1-4b71-9663-28fa89b5c123"},{"name":"Cache-Control","value":"max-age=1800"},{"name":"Expires","value":"Wed, 10 Aug 2016 01:10:18 GMT"},{"name":"Date","value":"Wed, 10 Aug 2016 00:40:18 GMT"},{"name":"Connection","value":"keep-alive"},{"name":"Set-Cookie","value":"MobileOptOut=1; path=/; domain=submarino.com.br\nb2wChannel=INTERNET; path=/; domain=submarino.com.br"},{"name":"Vary","value":"Accept-Encoding, User-Agent"}],"id":1,"redirectURL":null,"stage":"end","status":200,"statusText":"OK","time":"2016-08-10T00:40:18.390Z","url":"http://www.submarino.com.br/produto/126862765/"}

我尝试通过hurl.it和其他cURL服务请求该网站,它们都可以正常访问该网址。我还能做些什么吗?这快把我逼疯了!

1 个答案:

答案 0 :(得分:2)

很可能是基于地理位置或可疑IP段的访问限制。我刚刚尝试打开该网址,同样被拒绝访问;随后通过美国代理访问,就能够打开了。只需使用美国或巴西的代理即可。

另外,抓取时模仿真实浏览器的行为非常重要,因此我建议您在脚本中加入useragent和viewport的模拟:

// Present the scraper as a desktop Chrome browser: send a Chrome-on-Windows
// user agent and render with a 1280x800 viewport.
page.settings.userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
page.viewportSize = {
    width: 1280,
    height: 800
};

还要确保订阅错误和控制台消息,以了解目标页面中的任何错误和消息。

// Relay each console message from the target page to this script's own
// output, prefixed so it can be told apart from the scraper's output.
page.onConsoleMessage = function (message) {
    var line = 'CONSOLE: ' + message;
    console.log(line);
};

// Log JavaScript errors reported for the target page: the error message
// first, then one line per stack frame with its file and line number.
page.onError = function (msg, trace) {
    console.log(msg);
    // Guard against a missing or empty trace so the handler itself cannot
    // throw while reporting an error.
    if (trace && trace.length) {
        trace.forEach(function (item) {
            console.log(' ', item.file, ':', item.line);
        });
    }
};