Puppeteer和PhantomJS相似。我在两者上都遇到了同样的问题,代码也很相似。
我想从网站上获取一些信息,该网站需要进行身份验证才能查看这些信息。我甚至无法访问主页,因为我的访问被检测为“可疑活动”,截图见:https://i.imgur.com/p69OIjO.png
我发现,当我使用名为 Cookie 的标头在Postman上进行测试并且该cookie的值在浏览器中被捕获时,该问题不会发生,但是此cookie会在一段时间后过期。因此,我猜Puppeteer / PhantomJS都没有捕获cookie,因为该站点拒绝了无头的浏览器访问。
我该怎么做才能绕过这个?
// Simple Javascript example
// Loads the home page in PhantomJS and saves a screenshot to home.png.
var page = require('webpage').create();
var url = 'https://www.expertflyer.com';
page.open(url, function (status) {
  var loaded = status === "success";
  if (loaded) {
    page.render("home.png");
  } else {
    console.log('Failed to load ' + url + ' (status: ' + status + ')');
  }
  // Always exit: previously a failed load never reached phantom.exit(),
  // which left the PhantomJS process running indefinitely.
  phantom.exit(loaded ? 0 : 1);
});
答案 0 :(得分:2)
您要访问的网站使用Distil Networks来防止网页抓取。
过去曾有人通过替换Chromium(Puppeteer所使用的浏览器)的call_function.js中的$cdc_变量,
成功绕过了Distil Networks。
例如:
/**
 * Returns the element cache stored on a document, creating it on first use.
 *
 * @param {Document=} opt_doc Document that owns the cache; defaults to the
 *     global `document`.
 * @param {boolean=} opt_w3c When truthy a CacheWithUUID is created, otherwise
 *     a plain Cache.
 * @return {!Object} The cache instance stored under the key on the document.
 */
function getPageCache(opt_doc, opt_w3c) {
  const ownerDoc = opt_doc || document;
  const useW3c = opt_w3c || false;
  // The stock key is '$cdc_asdjflasutopfhvcZLmcfl_'; renaming it is the one
  // change this example demonstrates.
  const cacheKey = '$something_different_';
  if (!(cacheKey in ownerDoc)) {
    ownerDoc[cacheKey] = useW3c ? new CacheWithUUID() : new Cache();
  }
  return ownerDoc[cacheKey];
}
注意:根据this comment,如果在进行此更改之前已被列入黑名单,则您将面临另一组挑战,因此您必须“实施假画布指纹识别,禁用Flash,更改IP,并更改请求标头顺序(交换语言和接受标头)。”
答案 1 :(得分:1)
如果从网站角度考虑,您确实在做可疑的工作。因此,每当您想绕开这种事情时,请务必考虑一下他们的想法。
Puppeteer和PhantomJS等使用的是真实的浏览器,在其中使用cookie比通过Postman等工具使用时效果更好。您只需要正确使用cookie。
您可以使用page.setCookie(...cookies)
设置cookie。 Cookies是序列化的,因此,如果Cookies是对象数组,则只需执行此操作,
// Placeholder values for illustration only; substitute the real cookies here.
const cookies = [
  { name: 'test', value: 'foo' },
  { name: 'test2', value: 'foo' },
];
// Spread the array so each cookie object is passed as its own argument.
await page.setCookie(...cookies);
关闭无头模式,然后查看网站的行为。
// Launch with headless mode switched off so the browser window is visible and the site's behaviour can be observed.
await puppeteer.launch({headless: false})
某些网站会基于IP地址进行监控,如果同一IP在短时间内发起多次访问,就会阻止该请求。在这种情况下,最好使用轮换代理。
答案 2 :(得分:1)
总的来说可以帮助您的事情:
答案 3 :(得分:1)
如果将来有人遇到同样的问题,可以使用puppeteer-extra。
我已经在服务器上测试了代码。在第二次运行中,有Google Captcha。您可以自行解决问题,然后重新启动机器人或使用验证码解决服务。
我的代码运行了10次以上,没有ip禁止。继续运行时,我再也没有收到验证码。
但是您可以再次获得验证码!
// Install the dependencies first:
//   sudo npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-adblocker readline
// First command-line argument: the browser runs headless only when this is the literal string 'true'.
var headless_mode = process.argv[2]
// Used to prompt on stdin/stdout while a captcha is solved by hand.
const readline = require('readline');
// puppeteer-extra wraps puppeteer so that plugins can be registered with use().
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin())
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
// blockTrackers: true is passed to the adblocker plugin.
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
/**
 * Opens expertflyer.com in the plugin-equipped browser, waits for the page to
 * settle, then clicks through two menu entries and saves a screenshot after
 * each one (expertflyer1.png, expertflyer2.png).
 *
 * Relies on names from the enclosing file: `puppeteer` (puppeteer-extra with
 * its plugins registered), `readline` and `headless_mode`.
 *
 * @returns {Promise<void>} Resolves after the browser has been closed.
 *   Rejects when a step fails; the browser is closed in that case too.
 */
async function run () {
  const browser = await puppeteer.launch({
    headless: headless_mode === 'true',
    ignoreHTTPSErrors: true,
    slowMo: 0,
    args: [
      '--window-size=1400,900',
      '--remote-debugging-port=9222',
      // Binds the DevTools endpoint to every interface so a captcha can be
      // solved remotely via chrome://inspect. Anyone who can reach port 9222
      // can drive this browser, so only use it on a trusted network.
      '--remote-debugging-address=0.0.0.0',
      '--disable-gpu',
      '--disable-features=IsolateOrigins,site-per-process',
      '--blink-settings=imagesEnabled=true',
    ],
  });

  let page;
  try {
    page = await browser.newPage();
    console.log(`Testing expertflyer.com`);
    await goto_Page('https://www.expertflyer.com');
    await waitForNetworkIdle(page, 3000, 0);
    await checking_error(do_2nd_part);
  } finally {
    // Close the browser on every path so a failure cannot leave it running.
    await browser.close();
  }

  // ---- helpers (function declarations are hoisted, so they are usable above) ----

  /** Resolves after `ms` milliseconds; replaces page.waitFor(ms), which newer Puppeteer releases no longer provide. */
  function sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** Returns the innerText of the first element matching `selector`, or null when it cannot be read. */
  async function read_text(selector) {
    try {
      return await page.$eval(selector, (e) => e.innerText);
    } catch {
      // Best effort: a missing element is an expected outcome here.
      return null;
    }
  }

  /** Clicks `selector` when possible; a failed click is logged and otherwise ignored. */
  async function click_if_present(selector) {
    try {
      await page.click(selector);
    } catch (error) {
      console.log(`Click on ${selector} skipped: ${error}`);
    }
  }

  /** Second step: open the '#yui-gen2' entry, log the page title, take a screenshot, continue with step three. */
  async function do_2nd_part() {
    await click_if_present('#yui-gen2 > a');
    await sleep(5000);
    const seat = await read_text('#headerTitleContainer > h1');
    if (seat !== null) {
      console.log(seat);
    }
    await page.screenshot({ path: 'expertflyer1.png' });
    await checking_error(do_3nd_part);
  }

  /** Third step: open the '#yui-gen1' entry, log the page title and take the final screenshot. */
  async function do_3nd_part() {
    await click_if_present('#yui-gen1 > a');
    await sleep(5000);
    const pro = await read_text('#headerTitleContainer > h1');
    if (pro !== null) {
      console.log(pro);
    }
    await page.screenshot({ path: 'expertflyer2.png' });
    console.log(`All done, check the screenshots?`);
  }

  /**
   * Checks whether the normal page (with its menu links) is showing and, if
   * so, runs `callback`. Otherwise it distinguishes the captcha page, the
   * IP-block page and a page that is still loading.
   *
   * @param {() => Promise<void>} callback Next step to run once the page is usable.
   * @param {number} [maxChecks=6] How many times to look at the page before giving up.
   * @throws {Error} When the page never reaches a recognisable state.
   */
  async function checking_error(callback, maxChecks = 6) {
    const captcha_msg = "Due to suspicious activity from your computer, we have blocked your access to ExpertFlyer. After completing the CAPTCHA below, you will immediately regain access unless further suspicious behavior is detected.";
    const ip_blocked = "Due to recent suspicious activity from your computer, we have blocked your access to ExpertFlyer. If you feel this block is in error, please contact us using the form below.";

    for (let check = 1; check <= maxChecks; check += 1) {
      let menuLinkCount;
      try {
        menuLinkCount = await page.evaluate(() => document.querySelectorAll('a[class="text yuimenubaritemlabel"]').length);
      } catch (error) {
        // A failed evaluation is treated like "menu not found" and re-checked.
        console.log(`catch error ${error}`);
      }

      if (menuLinkCount > 0) {
        console.log(`Page loaded successfully! You can do things here.`);
        await callback();
        return;
      }

      console.log(`Error found`);
      const error_msg_details = await read_text('body > p:nth-child(2)');

      if (error_msg_details === captcha_msg) {
        console.log(`Google Captcha found, You have to solve the captch here manually or some automation recaptcha service`);
        // NOTE(review): the original author reported having to restart the bot
        // after solving the captcha; continuing in the same session is untested.
        await verify_User_answer();
        await callback();
        return;
      }

      if (error_msg_details === ip_blocked) {
        console.log(`The current ip address is blocked. The only way is change the ip address.`);
        return;
      }

      if (check < maxChecks) {
        console.log(`Waiting for error page load... Waiting for 10 sec before rechecking...`);
        await sleep(10000);
      }
    }

    throw new Error(`Gave up after ${maxChecks} checks: no menu, captcha or IP-block notice was found`);
  }

  /**
   * Navigates to `page_URL`, retrying when the navigation fails.
   *
   * @param {string} page_URL Address to open.
   * @param {number} [maxAttempts=5] Attempts before the last error is rethrown
   *   (the original retried forever, spinning on permanent failures).
   */
  async function goto_Page(page_URL, maxAttempts = 5) {
    for (let attempt = 1; ; attempt += 1) {
      try {
        await page.goto(page_URL, { waitUntil: 'networkidle2', timeout: 30000 });
        return;
      } catch (error) {
        if (attempt >= maxAttempts) {
          throw error;
        }
        console.log(`Error in loading page, re-trying...`);
      }
    }
  }

  /**
   * Prompts until the user types exactly 'yes'.
   *
   * @param {() => Promise<void>} [call_back] Optional step to run afterwards.
   *   It is skipped when omitted; calling it unconditionally used to throw a
   *   TypeError that was silently swallowed.
   */
  async function verify_User_answer(call_back) {
    let user_Answer = await readLine();
    // Every answer is checked; the original discarded each second one.
    while (user_Answer !== 'yes') {
      console.log(`answer not match. try again...`);
      user_Answer = await readLine();
    }
    console.log(`user_Answer is ${user_Answer}, Processing...`);
    if (typeof call_back === 'function') {
      await call_back();
    }
  }

  /** Asks one question on stdin/stdout and resolves with the typed line. */
  async function readLine() {
    const rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout
    });
    return new Promise((resolve) => {
      rl.question('Solve the captcha and type yes to continue: ', (answer) => {
        rl.close();
        resolve(answer);
      });
    });
  }

  /**
   * Resolves once no more than `maxInflightRequests` requests have been in
   * flight for `timeout` milliseconds, counting only requests that start
   * after this call.
   *
   * @param {object} page Object emitting 'request', 'requestfinished' and 'requestfailed'.
   * @param {number} timeout Quiet period in milliseconds.
   * @param {number} [maxInflightRequests=0] Requests tolerated while still counting as idle.
   */
  async function waitForNetworkIdle(page, timeout, maxInflightRequests = 0) {
    console.log('waitForNetworkIdle called');
    let inflight = 0;
    let fulfill;
    const promise = new Promise((x) => { fulfill = x; });
    page.on('request', onRequestStarted);
    page.on('requestfinished', onRequestFinished);
    page.on('requestfailed', onRequestFinished);
    let timeoutId = setTimeout(onTimeoutDone, timeout);
    return promise;

    function onTimeoutDone() {
      page.removeListener('request', onRequestStarted);
      page.removeListener('requestfinished', onRequestFinished);
      page.removeListener('requestfailed', onRequestFinished);
      fulfill();
    }

    function onRequestStarted() {
      ++inflight;
      if (inflight > maxInflightRequests) {
        // Too busy to count as idle: stop the quiet-period timer.
        clearTimeout(timeoutId);
      }
    }

    function onRequestFinished() {
      if (inflight === 0) {
        // Completion of a request that started before listening began.
        return;
      }
      --inflight;
      if (inflight === maxInflightRequests) {
        timeoutId = setTimeout(onTimeoutDone, timeout);
      }
    }
  }
}
// Report any failure from the async entry point instead of leaving a floating promise.
run().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
请注意“解决验证码并输入yes继续:”该方法无法正常工作,需要进行一些修复。
编辑:10分钟后重新运行bot时,再次出现了验证码。我在chrome://inspect/#devices
上解决了验证码,然后重新启动了bot,一切再次正常。没有IP封禁。