尝试测试页面https://publicindex.sccourts.org/anderson/publicindex/ 使用标准浏览器导航到该页面时,导航会在请求的页面(https://publicindex.sccourts.org/anderson/publicindex/)处结束,并且该页面显示一个“接受”按钮。
但是,在无头(headless)模式下使用 Puppeteer 进行测试时,请求会被重定向到 https://publicindex.sccourts.org。
我对正在发生的事情有一个大概的了解,但是在使用 Puppeteer 请求该页面时,似乎无法阻止它被重定向到 https://publicindex.sccourts.org。我认为在由用户操作的浏览器中,发生的过程如下:
页面请求已发送。 (假设首次访问)
响应是纯JS,
js代码指定为:
复制初始页面请求标头
添加特定的标头,然后重新请求同一页面(xhr)
从其中一个响应标头复制一个网址,并替换位置
(或)
检查页面历史记录,
将响应页面的网址添加到历史记录中,
打开一个新窗口,
将xhr响应写入新页面
关闭新窗口
在返回的xhr请求中为函数添加事件监听器
触发事件
我曾尝试使用 Puppeteer 跟踪 JS、记录 HAR、监视 cookie、监视请求链、拦截页面请求并调整请求标头、监视历史记录等等,但仍然一头雾水。
这是 Puppeteer 脚本的最基本版本:
/**
 * Minimal reproduction: open the target page in headless Chromium, record a
 * HAR file of the network traffic, wait for the follow-up navigation and
 * print the HTML of the page the browser finally lands on.
 *
 * @returns {Promise<void>} Settles once the browser has been closed. Errors
 *   are logged and reported through `process.exitCode` instead of surfacing
 *   as an unhandled rejection.
 */
function run() {
  const url = 'https://publicindex.sccourts.org/anderson/publicindex/';
  const puppeteer = require('puppeteer');
  const PuppeteerHar = require('puppeteer-har');

  return puppeteer.launch({headless: true}).then(async browser => {
    try {
      const page = await browser.newPage();
      await page.setJavaScriptEnabled(true);
      await page.setViewport({width: 1920, height: 1280});
      await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');

      const har = new PuppeteerHar(page);
      await har.start({path: 'results.har'});
      try {
        await page.goto(url);
        // Waits for one more navigation after the initial load (the page is
        // expected to navigate again from script). NOTE(review): if that
        // navigation has already finished, or never happens, this call
        // rejects with a timeout; the finally blocks still clean up.
        await page.waitForNavigation();
      } finally {
        // Stop the recording even when navigation fails so results.har
        // holds whatever was captured up to that point.
        await har.stop();
      }

      const bodyHTML = await page.content();
      console.log(bodyHTML);
    } finally {
      // Without this the Chromium child process stays alive and keeps the
      // Node process from exiting.
      await browser.close();
    }
  }).catch(err => {
    console.error(err);
    process.exitCode = 1;
  });
}
run();
当我在 Chrome 中打开该页面时,导航最终会停在“接受”(accept)页面上;为什么不能让 Puppeteer 简单地复现 JS 正在执行的同一过程呢?
这是一个日志记录更详细的版本:
/**
 * Verbose diagnostic run against the target page.
 *
 * Logs frame and request lifecycle events, intercepts requests (rewriting
 * the headers of the first request to the target URL and aborting requests
 * once more than two have been counted), records a HAR file, a trace and JS
 * coverage, prints the redirect chain of the main response and finally the
 * HTML of the page.
 *
 * @returns {Promise<void>} Settles once the browser has been closed. Errors
 *   are logged and reported through `process.exitCode`.
 */
function run() {
  const url = 'https://publicindex.sccourts.org/anderson/publicindex/';
  const puppeteer = require('puppeteer');
  const PuppeteerHar = require('puppeteer-har');

  return puppeteer.launch().then(async browser => {
    try {
      const page = await browser.newPage();
      await page.setJavaScriptEnabled(true);
      await page.setViewport({width: 1920, height: 1280});
      await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');
      await page.setRequestInterception(true);

      page.on('frameattached', () => { console.log('frame attached '); });
      page.on('framedetached', () => { console.log('frame detached '); });
      page.on('framenavigated', () => { console.log('frame navigated '); });
      page.on('requestfailed', () => { console.log('request failed '); });
      page.on('requestfinished', req => {
        console.log('frame finished ');
        console.log(req.url());
      });

      // Number of intercepted requests that were allowed to continue.
      let count = 0;
      page.on('request', interceptedRequest => {
        console.log(`requesting ${count}times`);
        console.log('request for ' + interceptedRequest.url());
        console.log(interceptedRequest);

        // Only the first three requests are let through.
        if (count > 2) {
          interceptedRequest.abort();
          return;
        }

        const isTargetUrl = interceptedRequest.url() === url;
        count++;

        // Only when the very first counted request is for the target URL
        // are its headers overridden with the values below.
        if (isTargetUrl && count === 1) {
          const headers = interceptedRequest.headers();
          headers['authority'] = 'publicindex.sccourts.org';
          headers['sec-fetch-dest'] = 'empty';
          headers['sec-fetch-mode'] = 'cors';
          headers['sec-fetch-site'] = 'same-origin';
          headers['upgrade-insecure-requests'] = '1';
          interceptedRequest.continue({headers});
          return;
        }

        interceptedRequest.continue();
      });

      const har = new PuppeteerHar(page);
      await har.start({path: 'results.har'});
      await page.tracing.start({path: 'trace.json'});
      try {
        await page.coverage.startJSCoverage({reportAnonymousScripts: true});
        const response = await page.goto(url);

        const session = await page.target().createCDPSession();
        await session.send('Page.enable');
        await session.send('Page.setWebLifecycleState', {state: 'active'});

        // Logged as the flat coverage array (previously wrapped in an
        // extra single-element array by Promise.all).
        const jsCoverage = await page.coverage.stopJSCoverage();
        console.log(jsCoverage);

        // Print the URLs of the redirect chain; concatenating the request
        // objects themselves with a string does not show their URLs.
        const chain = response.request().redirectChain();
        console.log(chain.map(req => req.url()).join(' -> ') + "\n\n");

        // NOTE(review): waits for one more navigation after the initial
        // load; rejects with a timeout if none happens.
        await page.waitForNavigation();
      } finally {
        // Tracing was started above and must be stopped for trace.json to
        // be completed; the HAR recording is stopped here as well so both
        // files survive a failed navigation.
        await page.tracing.stop();
        await har.stop();
      }

      const bodyHTML = await page.content();
      console.log(bodyHTML);
    } finally {
      // Release the Chromium process so the Node process can exit.
      await browser.close();
    }
  }).catch(err => {
    console.error(err);
    process.exitCode = 1;
  });
}
run();
答案 0 :(得分:1)
我没有完整的解决方案,但我知道重定向发生在哪里。
我通过以下方式在本地测试了您的脚本:
const puppeteer = require('puppeteer');
const PuppeteerHar = require('puppeteer-har');
/**
 * Headful debugging run: opens the target page in a visible browser with
 * DevTools, logs every request URL and every response status and headers,
 * and records a HAR file. The browser is left open afterwards.
 *
 * Relies on `puppeteer` and `PuppeteerHar` being required at module level.
 */
function run() {
  const targetUrl = 'https://publicindex.sccourts.org/anderson/publicindex/';

  // Log each intercepted request and let it proceed unchanged.
  const onRequest = (req) => {
    console.log('GOT NEW REQUEST', req.url());
    req.continue();
  };

  // Log the status code and headers of each response.
  const onResponse = (res) => {
    console.log('GOT NEW RESPONSE', res.status(), res.headers());
  };

  const inspect = async (browser) => {
    const tab = await browser.newPage();

    await tab.setRequestInterception(true);
    tab.on('request', onRequest);
    tab.on('response', onResponse);

    await tab.setJavaScriptEnabled(true);
    await tab.setViewport({width: 1920, height: 1280});
    await tab.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36');

    const recorder = new PuppeteerHar(tab);
    await recorder.start({path: 'results.har'});
    await tab.goto(targetUrl);
    await tab.waitForNavigation();
    await recorder.stop();

    // The HTML is fetched but not used.
    await tab.content();
  };

  puppeteer.launch({headless: false, devtools: true}).then(inspect);
}
run();
我编辑了三个部分:
移动了 require 导入的位置(放到函数外部),因为嵌套在函数里的写法看着很别扭——我见到的 require 调用通常都不嵌套。
调试时可以看到,页面 https://publicindex.sccourts.org/anderson/publicindex/ 发出了一个到 https://publicindex.sccourts.org/ 的请求。
但是,此请求返回了一个 302 重定向,其 Location 指向 https://www.sccourts.org/caseSearch/,因此浏览器会相应地跟随跳转。
我会尝试调查这个奇怪的请求是否正常,以及为什么在 Puppeteer 控制的 Chrome 中会发生重定向。
这篇帖子(post)可能会有所帮助,问题可能与 Chromium 被视为不安全有关。
我还尝试把 args: ['--disable-web-security', '--allow-running-insecure-content'] 传给 launch() 的选项对象,但没有效果。
请告诉我们情况如何! Har很高兴被发现!