我一直在使用puppeteer尝试获取pdfs - 或者它的缓冲区响应 - 来自一个网站,该网站在点击文档链接后执行两个请求(在新标签中打开):
我尝试的结果是生成了一个空白的pdf,即使它是在页面加载后创建的(使用Fiddler检查)。
我试过
targetcreated
事件以获取页面page.goto
获取pdf Page.setDownloadBehaviour
设置为允许下载而不是在浏览器中呈现感谢任何指导和帮助。 尝试的代码如下:
const puppeteer = require("puppeteer");
let browser;
async function getDocument(index, title, page) {
if (index != 19) return "";
console.log("getDocument START");
console.log("#repDocuments__ctl" + index + "_lnkViewDoc\ntitle: " + title);
let docPagePromise = new Promise((resolve, reject) =>
browser.once("targetcreated", async target => {
let targetUrl = await target.url();
if (targetUrl.indexOf("ViewDocument.aspx?") !== -1) {
console.log(targetUrl);
return resolve(target.page());
} else {
console.log("Failed to detect the ViewDocument page");
}
})
);
/* Tried to set the download behaviour to download automatically the pdf but it didn't work */
// await page._client.send("Page.setDownloadBehaviour", {
// behaviour: "allow",
// downloadPath: "./"
// });
await page.click(`#repDocuments__ctl${index}_lnkViewDoc`);
let pdfResults = "";
let pdfPage = await docPagePromise;
/* If I get the target from the page returned from the promise I get the correct ur, however the page url is blank */
// let target = await pdfPage.target();
// let url = await target.url();
// let response = await pdfPage.goto(url);
// console.log(response);
pdfPage.on("console.log", msg => console.log(msg));
/* This is never called */
await pdfPage.on("response", async response => {
console.log("PDF PAGE Response");
let responseBuffer = await response.buffer();
let responseHeaders = response.headers();
console.log("PDF PAGE Response Header: " + responseHeaders);
console.log("PDF PAGE Response Buffer: " + responseBuffer);
return {
responseHeaders,
responseBuffer
};
});
console.log(pdfResults);
let pdfTitle = await pdfPage.title();
console.log("PDFPage URL: " + pdfPage.url());
console.log("PDFPage Title: " + pdfTitle);
let pdfTarget = await pdfPage.target();
console.log("PDFTarget URL: " + (await pdfTarget.url()));
console.log("PDFTarget Type: " + pdfTarget.type());
pdfPage = await pdfTarget.page();
console.log("PDFPage URL: " + pdfPage.url());
await pdfPage.waitFor(3000);
let pdf = await pdfPage.pdf({ path: title + ".pdf" });
console.log(pdf);
return pdf;
}
async function getAdditionalDocumentation(page) {
console.log("getAdditionalDocumentation START");
await page.waitForSelector("#repGroupSummary__ctl1_lnkGroupName");
await page.click("#repGroupSummary__ctl1_lnkGroupName");
await page.waitForSelector("#pnlDocumentList > table > tbody > tr");
await page.waitFor(2000);
const documents = await page.$$eval(
"#pnlDocumentList > table > tbody > tr",
docs =>
docs.map((doc, i) => ({
type: doc.querySelector(".tdl-subgroup > span").innerText,
datePublished: doc.querySelector(
".tdl-date > span[id*='DatePublished']"
).innerText,
dateReceived: doc.querySelector(".tdl-date > span[id*='DateReceived']")
.innerText,
docType: doc.querySelector(".tdl-doctype > span").innerText,
description: doc.querySelector(".tdl-description > span").innerText
// 'docBuffer': window.getDocument(i + 1, doc.querySelector('.tdl-description > span').innerText)
}))
);
for (let i = 0; i < documents.length; i++) {
documents[i].docBuffer = await getDocument(i + 1, documents[i].description, page);
}
await page.click("#btnSummary");
console.log("getAdditionalDocumentation FINISH");
return documents;
}
async function getDocuments(page, browser) {
console.log("getDocuments");
let newPagePromise = new Promise((resolve, reject) =>
browser.once("targetcreated", async target => {
let targetUrl = await target.url();
if (targetUrl.indexOf("ShowCaseFile.aspx?") !== -1) {
console.log(targetUrl);
return resolve(target.page());
} else {
console.log("Failed to detect the ShowCaseFile page");
}
})
);
await page.click("#tab_externalDocuments > span");
await page.waitForSelector("#hp-doc-link");
await page.click("#hp-doc-link");
const newPage = await newPagePromise;
const additionalDocumentation = await getAdditionalDocumentation(newPage);
return {
additionalDocumentation
};
}
async function run() {
try {
browser = await puppeteer.launch();
const page = await browser.newPage();
page.on("console", msg => console.log("PAGE LOG:", ...msg.args));
const planningReference = "LA04/2017/1388/F";
await page.goto(
"http://epicpublic.planningni.gov.uk/publicaccess/search.do?action=simple&searchType=Application"
);
await page.waitForSelector("#simpleSearchString");
await page.type("#simpleSearchString", planningReference);
await page.click("#simpleSearchForm > div.row3 > input.button.primary");
await page.waitForSelector("#simpleDetailsTable");
console.log("getDocuments START");
const documents = await getDocuments(page, browser);
console.log("getDocuments FINISH");
console.log(documents);
console.log(documents.additionalDocumentation.length);
} finally {
browser.close();
}
}
run();
答案 0 :(得分:1)
使用exposefunction将缓冲区数据写入磁盘:
page.exposeFunction("writeABString", async (strbuf, targetFile) => {
var str2ab = function _str2ab(str) { // Convert a UTF-8 String to an ArrayBuffer
var buf = new ArrayBuffer(str.length); // 1 byte for each char
var bufView = new Uint8Array(buf);
for (var i=0, strLen=str.length; i < strLen; i++) {
bufView[i] = str.charCodeAt(i);
}
return buf;
}
console.log("In 'writeABString' function...");
return new Promise((resolve, reject) => {
// Convert the ArrayBuffer string back to an ArrayBufffer, which in turn is converted to a Buffer
let buf = Buffer.from(str2ab(strbuf));
// Try saving the file.
fs.writeFile(targetFile, buf, (err, text) => {
if(err) reject(err);
else resolve(targetFile);
});
});
});
使用下载链接,您可以将其与fetch api配合使用,将其作为blob进行转换,并将其转换为:
page.evaluate( async () => {
function arrayBufferToString(buffer){ // Convert an ArrayBuffer to an UTF-8 String
var bufView = new Uint8Array(buffer);
var length = bufView.length;
var result = '';
var addition = Math.pow(2,8)-1;
for(var i = 0;i<length;i+=addition){
if(i + addition > length){
addition = length - i;
}
result += String.fromCharCode.apply(null, bufView.subarray(i,i+addition));
}
return result;
}
let geturl = "https://whateverurl.example.com";
return fetch(geturl, {
credentials: 'same-origin', // usefull when we are logged into a website and want to send cookies
responseType: 'arraybuffer', // get response as an ArrayBuffer
})
.then(response => response.arrayBuffer())
.then( arrayBuffer => {
var bufstring = arrayBufferToString(arrayBuffer);
return window.writeABString(bufstring, '/tmp/downloadtest.pdf');
})
.catch(function (error) {
console.log('Request failed: ', error);
});
});
有关更多信息,请在github puppeteer页面上查看此问题。该问题还提出了上述解决方案。 Source