我偶尔需要访问HTML页面来更新数据库。可以通过Web浏览器轻松访问此页面,但是当我尝试通过node.js应用程序访问该页面时,该页面不起作用(网站检测到请求是由机器人发出的)。但是,
user-agent
)是网络浏览器所请求的。referer
标头或cookie
标头,但浏览器请求中都没有。在我看来,机器人请求和浏览器请求严格相同。但是,它们的处理方式不同。
我的想法已经用完了……也许请求包含诸如“此请求是由node.js发送的”之类的元数据,但这确实很奇怪。
编辑,这是一个代码示例:
// callback (error, responseContent)
function getPage (callback){
let options = {
protocol : 'https:',
hostname : 'xxx.yyy.fr',
port : 443,
path : '/abc/def',
agent : false,
headers : {
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding' : 'gzip, deflate, br',
'Accept-Language' : 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
'Cache-Control' : 'no-cache',
'Connection' : 'keep-alive',
'DNT' : '1',
'Host' : 'ooshop.carrefour.fr',
'Pragma' : 'no-cache',
'Upgrade-Insecure-Requests' : '1',
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
}
};
https.get (options, function (res){
if (res.statusCode !== 200){
res.resume ();
callback ('Error : res code != 200, res code = ' + res.statusCode);
return;
}
res.setEncoding ('utf-8');
let content = '';
res.on ('data', chunk => content += chunk);
res.on ('end', () => callback (null, content));
}).on ('error', e => callback (e));
}
编辑:这是请求/响应的比较:
Mozilla Firefox
请求标头:
GET /3274080001005/eau-de-source-cristaline HTTP/1.1
Host: ooshop.carrefour.fr
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3
Accept-Encoding: gzip, deflate, br
DNT: 1
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Pragma: no-cache
Cache-Control: no-cache
响应头:
HTTP/2.0 200 OK
date: Wed, 11 Jul 2018 21:25:25 GMT
server: Unknown
content-type: text/html; charset=UTF-8
age: 0
x-varnish-cache: MISS
accept-ranges: bytes
set-cookie: visid_incap_1213048=G8a0mWzmQYi0GKuT2Ht7YeQ9QVsAAAAAQkIPAAAAAADvVZnsZHK18dQQxHakBprg; expires=Thu, 11 Jul 2019 11:17:56 GMT; path=/; Domain=.carrefour.fr
incap_ses_466_1213048=/2NKHS4HXU0T7FpkwpJ3BsV1RlsAAAAAAY3wbUkXacAceu2NkgUrhw==; path=/; Domain=.carrefour.fr
x-iinfo: 7-11020186-11020187 NNNN CT(1 2 0) RT(1531344324722 0) q(0 0 0 0) r(4 4) U12
x-cdn: Incapsula
content-encoding: gzip
X-Firefox-Spdy: h2
响应内容:预期的HTML页面
Node.js机器人
请求标头:
Accept : text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Encoding : gzip, deflate, br
Accept-Language : fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3
Cache-Control : no-cache
Connection : keep-alive
DNT : 1
Host : ooshop.carrefour.fr
Pragma : no-cache
Upgrade-Insecure-Requests : 1
User-Agent : Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0
响应头:
content-type : text/html
connection : close, close
cache-control : no-cache
content-length : 210
x-iinfo : 1-17862634-0 0NNN RT(1531344295049 65) q(0 -1 -1 0) r(0 -1) B10(4,314,0) U19
set-cookie : incap_ses_466_1213048=j34jMBWkPFYT7FpkwpJ3Bqd1RlsAAAAAVBfoZBShAvoun/M8UFxPPA==; path=/; Domain=.carrefour.fr
回复内容:
<html>
<head>
<META NAME="robots" CONTENT="noindex,nofollow">
<script src="/_Incapsula_Resource?SWJIYLWA=5074a744e2e3d891814e9a2dace20bd4,719d34d31c8e3a6e6fffd425f7e032f3">
</script>
<body>
</body></html>