使用puppeteer从会话生成的url获取pdf缓冲区

时间:2018-02-16 15:47:09

标签: javascript puppeteer

我一直在尝试使用puppeteer从一个网站获取pdf(或者它的缓冲区响应)。该网站在点击文档链接后(在新标签页中打开)会依次执行两个请求:

  1. 第一个请求(http://epicdocs.planningni.gov.uk/ViewDocument.pa?uri=4157826&ext=PDF)检索会话guid以访问文档
  2. 第二个请求(http://epicdocs.planningni.gov.uk/ViewDocument.aspx?guid=4ecd1fe5-43c6-4202-96e3-66b393fb819c)使用该guid访问文档并在浏览器上呈现pdf。
我尝试的结果是生成了一个空白的pdf,即使它是在页面加载完成之后才创建的(已用Fiddler检查)。

    我试过

    • 拦截targetcreated事件以获取页面
    • 获取第二个请求网址并使用page.goto获取pdf
    • 等待页面响应以获取缓冲区
    • Page.setDownloadBehaviour设置为允许下载而不是在浏览器中呈现

    感谢任何指导和帮助。 尝试的代码如下:

    const puppeteer = require("puppeteer");
    
    let browser;
    
    /**
     * Clicks one document link in the list, waits for the viewer tab that the
     * click opens, and prints that tab to `<title>.pdf`.
     *
     * Only the document at position 19 is processed (hard-coded filter); any
     * other index returns an empty string without touching the page.
     *
     * Relies on the module-level `browser` to observe the newly created tab.
     *
     * @param {number} index - Row index used to build the link id
     *   `#repDocuments__ctl<index>_lnkViewDoc`.
     * @param {string} title - Document title; also used as the PDF file name.
     * @param {object} page - Puppeteer page that shows the document list.
     * @returns {Promise<Buffer|string>} The value returned by `pdf()` on the
     *   viewer tab, or "" when the index is skipped.
     */
    async function getDocument(index, title, page) {
      if (Number(index) !== 19) return "";
      console.log("getDocument START");
      console.log("#repDocuments__ctl" + index + "_lnkViewDoc\ntitle: " + title);

      // Keep listening until the viewer tab shows up. A one-shot listener would
      // be consumed by whichever target is created first, and if that target is
      // not the viewer the promise would stay pending forever.
      const docPagePromise = new Promise(resolve => {
        const onTargetCreated = target => {
          const targetUrl = target.url();
          if (!targetUrl.includes("ViewDocument.aspx?")) {
            console.log("Failed to detect the ViewDocument page");
            return;
          }
          browser.removeListener("targetcreated", onTargetCreated);
          console.log(targetUrl);
          resolve(target.page());
        };
        browser.on("targetcreated", onTargetCreated);
      });

      // NOTE(review): forcing a download through the DevTools protocol needs the
      // method name `Page.setDownloadBehavior` with a `behavior` field (American
      // spelling) - confirm against the protocol reference before re-trying it.
      await page.click(`#repDocuments__ctl${index}_lnkViewDoc`);
      let pdfPage = await docPagePromise;

      // Pages emit "console"; there is no "console.log" event.
      pdfPage.on("console", msg => console.log(msg));

      // Diagnostic only: the listener is attached after the tab was created, so
      // responses that already arrived are not replayed to it.
      pdfPage.on("response", async response => {
        console.log("PDF PAGE Response");
        try {
          const responseBuffer = await response.buffer();
          const responseHeaders = response.headers();
          console.log("PDF PAGE Response Header: " + JSON.stringify(responseHeaders));
          console.log("PDF PAGE Response Buffer: " + responseBuffer);
        } catch (err) {
          // Reading a body can fail; do not let that surface as an unhandled
          // rejection from inside an event listener.
          console.log("PDF PAGE Response could not be read: " + err.message);
        }
      });

      const pdfTitle = await pdfPage.title();
      console.log("PDFPage URL: " + pdfPage.url());
      console.log("PDFPage Title: " + pdfTitle);

      const pdfTarget = await pdfPage.target();
      console.log("PDFTarget URL: " + (await pdfTarget.url()));
      console.log("PDFTarget Type: " + pdfTarget.type());
      pdfPage = await pdfTarget.page();
      console.log("PDFPage URL: " + pdfPage.url());

      // Fixed grace period before printing, as in the original flow.
      await pdfPage.waitFor(3000);
      const pdf = await pdfPage.pdf({ path: title + ".pdf" });
      console.log(pdf);
      return pdf;
    }
    
    /**
     * Opens the first document group on the case-file page, reads one record
     * per table row, then fetches each row's document in list order.
     *
     * @param {object} page - Puppeteer page showing the case file.
     * @returns {Promise<Array<object>>} One object per row with `type`,
     *   `datePublished`, `dateReceived`, `docType`, `description` and
     *   `docBuffer` (whatever `getDocument` returned for that row).
     */
    async function getAdditionalDocumentation(page) {
      const groupLink = "#repGroupSummary__ctl1_lnkGroupName";
      const rowSelector = "#pnlDocumentList > table > tbody > tr";

      console.log("getAdditionalDocumentation START");

      await page.waitForSelector(groupLink);
      await page.click(groupLink);
      await page.waitForSelector(rowSelector);

      // Fixed pause after the rows appear, before they are read.
      await page.waitFor(2000);

      // The callback is executed inside the page, so it must be self-contained.
      const documents = await page.$$eval(rowSelector, rows => {
        const records = [];
        for (const row of rows) {
          const textOf = selector => row.querySelector(selector).innerText;
          records.push({
            type: textOf(".tdl-subgroup > span"),
            datePublished: textOf(".tdl-date > span[id*='DatePublished']"),
            dateReceived: textOf(".tdl-date > span[id*='DateReceived']"),
            docType: textOf(".tdl-doctype > span"),
            description: textOf(".tdl-description > span")
          });
        }
        return records;
      });

      // Documents are fetched one at a time; link ids are 1-based.
      let position = 0;
      for (const entry of documents) {
        position += 1;
        entry.docBuffer = await getDocument(position, entry.description, page);
      }

      await page.click("#btnSummary");
      console.log("getAdditionalDocumentation FINISH");

      return documents;
    }
    
    /**
     * From the application details page, opens the external documents tab,
     * follows the case-file link (which opens a new tab) and collects the
     * documents listed there.
     *
     * @param {object} page - Puppeteer page showing the application details.
     * @param {object} browser - Puppeteer browser used to observe the new tab.
     * @returns {Promise<{additionalDocumentation: Array<object>}>} The records
     *   returned by `getAdditionalDocumentation` for the case-file tab.
     */
    async function getDocuments(page, browser) {
      console.log("getDocuments");

      // Keep listening until the case-file tab shows up. A one-shot listener
      // would be used up by the first target created, and if that target is not
      // the case file the promise would never settle.
      const newPagePromise = new Promise(resolve => {
        const onTargetCreated = target => {
          const targetUrl = target.url();
          if (!targetUrl.includes("ShowCaseFile.aspx?")) {
            console.log("Failed to detect the ShowCaseFile page");
            return;
          }
          browser.removeListener("targetcreated", onTargetCreated);
          console.log(targetUrl);
          resolve(target.page());
        };
        browser.on("targetcreated", onTargetCreated);
      });

      await page.click("#tab_externalDocuments > span");
      await page.waitForSelector("#hp-doc-link");

      await page.click("#hp-doc-link");
      const newPage = await newPagePromise;

      const additionalDocumentation = await getAdditionalDocumentation(newPage);

      return {
        additionalDocumentation
      };
    }
    
    
    
    
    /**
     * Entry point: launches the browser, searches the planning portal for a
     * fixed application reference, collects its documents and logs them.
     *
     * Assigns the launched instance to the module-level `browser`, which
     * `getDocument` reads.
     *
     * @returns {Promise<void>} Settles after the browser has been closed.
     *   Rejects with the original error if any step fails.
     */
    async function run() {
      try {
        browser = await puppeteer.launch();
        const page = await browser.newPage();

        page.on("console", msg => console.log("PAGE LOG:", ...msg.args));

        const planningReference = "LA04/2017/1388/F";
        await page.goto(
          "http://epicpublic.planningni.gov.uk/publicaccess/search.do?action=simple&searchType=Application"
        );
        await page.waitForSelector("#simpleSearchString");
        await page.type("#simpleSearchString", planningReference);
        await page.click("#simpleSearchForm > div.row3 > input.button.primary");

        await page.waitForSelector("#simpleDetailsTable");

        console.log("getDocuments START");
        const documents = await getDocuments(page, browser);
        console.log("getDocuments FINISH");

        console.log(documents);
        console.log(documents.additionalDocumentation.length);
      } finally {
        // `browser` is still unset when launch() itself failed; calling close()
        // on it would throw and hide the real error. Await the close so the
        // returned promise only settles once shutdown has finished.
        if (browser) {
          await browser.close();
        }
      }
    }
    
    // Start the scrape. Handle rejection explicitly so a failure is reported
    // and reflected in the exit code instead of being left as a floating promise.
    run().catch(err => {
      console.error(err);
      process.exitCode = 1;
    });
    

1 个答案:

答案 0 :(得分:1)

使用 page.exposeFunction 向页面暴露一个函数,用来将缓冲区数据写入磁盘:

// Expose a Node-side helper to the page. The page hands over file contents as
// a "binary string" (one character per byte, not UTF-8 text) together with a
// destination path; the helper writes those bytes to disk and resolves with
// the path, or rejects with the write error.
page.exposeFunction("writeABString", async (strbuf, targetFile) => {
    console.log("In 'writeABString' function...");

    // "latin1" turns each character code into exactly one byte, which is what
    // the previous hand-written char-code loop did.
    const buf = Buffer.from(strbuf, "latin1");

    // Adapt the callback-style fs.writeFile to a promise.
    return new Promise((resolve, reject) => {
        fs.writeFile(targetFile, buf, err => {
            if (err) reject(err);
            else resolve(targetFile);
        });
    });
});

拿到下载链接后,您可以在页面中用fetch api请求它,将响应读取为ArrayBuffer,再转换成字符串并交给上面暴露的函数写入磁盘:

// Runs inside the page: downloads a URL with the page's own cookies and passes
// the bytes to the exposed `writeABString` function. Resolves with whatever
// `writeABString` returns; on any failure the error is logged and the result
// is undefined.
page.evaluate(async () => {
    // Encode an ArrayBuffer as a binary string: one character (code 0-255)
    // per byte. This is not UTF-8 decoding.
    function arrayBufferToString(buffer) {
        const bytes = new Uint8Array(buffer);
        const chunkSize = 255; // keeps each apply() argument list short
        let result = '';

        // subarray() clamps the end index, so the last chunk needs no special case.
        for (let i = 0; i < bytes.length; i += chunkSize) {
            result += String.fromCharCode.apply(null, bytes.subarray(i, i + chunkSize));
        }
        return result;
    }

    const geturl = "https://whateverurl.example.com";

    try {
        // `responseType` is an XMLHttpRequest option and has no effect on
        // fetch(); the body is read as an ArrayBuffer via response.arrayBuffer().
        const response = await fetch(geturl, {
            credentials: 'same-origin', // send cookies when already logged in to the site
        });

        // Do not save an HTTP error page to disk as if it were the document.
        if (!response.ok) {
            throw new Error(`HTTP ${response.status} while fetching ${geturl}`);
        }

        const arrayBuffer = await response.arrayBuffer();
        const bufstring = arrayBufferToString(arrayBuffer);
        return await window.writeABString(bufstring, '/tmp/downloadtest.pdf');
    } catch (error) {
        console.log('Request failed: ', error);
    }
});

有关更多信息,请在github puppeteer页面上查看此问题。该问题还提出了上述解决方案。 Source