CasperJS被封锁

时间:2017-10-16 16:56:50

标签: web-scraping phantomjs web-crawler casperjs

我试图使用CasperJS构建一个网页抓取程序(scraper),但它一直被拦截。我读了几篇文章,说可以通过设置user-agent来避免,但即使设置了用户代理,我仍然被拦截。这是我目前的设置:

// CasperJS scraping script: configures a browser instance, requests the target
// page with an explicit Accept header and prints the HTML that was received.
var casper = require('casper').create({
    verbose: true,
    logLevel: 'debug',
    colorizerType: 'Dummy',
    waitTimeout: 30000, // timeout for waits (loading etc.)
    exitOnError: true,
    // Configure the viewport here so it is already in effect for the first
    // request, instead of resizing after the page has been opened.
    viewportSize: {
        width: 1280,
        height: 1024
    },
    pageSettings: {
        userAgent: 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.56 Safari/536.5',
        javascriptEnabled: true,
        loadImages: true,
        loadPlugins: true
    },
    // NOTE(review): parameter names are kept from the original; confirm the
    // argument order against the CasperJS options documentation before using them.
    onError: function(msg, backtrace) {
        // Exit with a non-zero status so callers can tell the run failed.
        this.exit(1);
    }
});

// Open the target page, sending a browser-like Accept header.
casper.start().then(function() {
    this.open('https://WEBSITE-URL', {
        headers: {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        }
    });
});

// Login
casper.then(function() {
    this.echo("Waiting for login form to load.");
    // Dump the markup currently loaded in the page for inspection.
    this.echo(this.getHTML());
});

// Without run() the queued steps are never executed.
casper.run();

我在运行casper后收到此HTML:

<!DOCTYPE html><html><head>
<meta name="ROBOTS" content="NOINDEX, NOFOLLOW">
<meta http-equiv="cache-control" content="max-age=0">
<meta http-equiv="cache-control" content="no-cache">
<meta http-equiv="expires" content="0">
<meta http-equiv="expires" content="Tue, 01 Jan 1980 1:00:00 GMT">
<meta http-equiv="pragma" content="no-cache">
<meta http-equiv="refresh" content="10; url=/distil_r_captcha.html?requestId=972f0bd8-1861-4c7b-8459-ce880b8cf2b6&amp;httpReferrer=%2F">
<script type="text/javascript">
(function(window){
    try {
        if (typeof sessionStorage !== 'undefined'){
            sessionStorage.setItem('distil_referrer', document.referrer);
        }
    } catch (e){}
})(window);
</script>
<script type="text/javascript" src="/dstltrntmls.js" defer="">
</script>
<style type="text/css">#d__fFH{position:absolute;top:-5000px;left:-5000px}#d__fF{font-family:serif;font-size:200px;visibility:hidden}#ruxctfdwzvsxvuucdvdtdtsufa{display:none!important}</style></head>
<body>
<div id="distilIdentificationBlock">&nbsp;</div>
<div id="d__fFH" style="position: absolute; top: -5000px; left: -5000px;">
<object id="d_dlg" classid="clsid:3050f819-98b5-11cf-bb82-00aa00bdce0b" width="0px" height="0px"></object>
<span id="d__fF" style="font-family: Courier, serif; font-size: 72px; visibility: hidden;">The quick brown fox jumps over the lazy dog.</span></div></body>
</html>

有没有办法解决这个问题?当我在Postman中尝试一个简单的GET请求时,它会返回实际的HTML,但在CasperJS中却得不到相同的内容。

0 个答案:

没有答案