使用无头 Chrome(headless Chrome)拦截图像请求数据

时间:2017-08-08 22:05:07

标签: node.js google-chrome headless-browser google-chrome-headless

我有一个用例,需要使用无头 Chrome 的 Network 域(https://chromedevtools.github.io/devtools-protocol/tot/Network/)拦截所有图像请求,并在保存之前确定图片尺寸(主要是为了丢弃图标之类的小图片)。

但是,我找不到在保存之前把图像数据加载到内存中的方法。我需要把它加载到 Img 对象中,才能获得 width 和 height。Network.getResponseBody 需要传入 requestId,而我在 Network.requestIntercepted 中无法获得这个 requestId。另外,Network.loadingFinished 给我的 encodedDataLength 变量始终是 "0",我不知道为什么。所以我的问题是:

  1. 如何截取所有 jpg / png 请求的响应并获取图像数据?我不希望通过 URL 字符串先把文件保存到磁盘、再重新加载回来。

  2. 最理想的情况:如何从响应头中获取图像尺寸?这样我就完全不必把数据读入内存。

  3. 我的代码如下:

    const chromeLauncher = require('chrome-launcher');
    const CDP = require('chrome-remote-interface');
    const file = require('fs');
    
    // Entry point: launches headless Chrome, turns on request interception,
    // logs details of every intercepted .jpg/.png request and every
    // loadingFinished event, then shuts everything down once the page's
    // load event fires.
    (async () => {
      // Start headless Chrome and attach a DevTools protocol client to the
      // debugging port reported by the launcher.
      const chrome = await chromeLauncher.launch({
        chromeFlags: ['--disable-gpu', '--headless']
      });
      const protocol = await CDP({ port: chrome.port });
      const { DOM, Network, Page, Emulation, Runtime } = protocol;

      // Enable the protocol domains, then switch on request interception.
      await Promise.all([Network.enable(), Page.enable(), Runtime.enable(), DOM.enable()]);
      await Network.setRequestInterceptionEnabled({ enabled: true });

      // A URL counts as an image when it contains ".jpg" or ".png" anywhere.
      const looksLikeImage = (url) => url.includes('.jpg') || url.includes('.png');

      const onRequestIntercepted = (event) => {
        const { interceptionId, request, resourceType } = event;

        if (looksLikeImage(request.url)) {
          console.log(JSON.stringify(request));
          console.log(resourceType);

          if (request.url.includes("/unspecified.jpg")) {
            console.log("FOUND unspecified.jpg");
            console.log(JSON.stringify(interceptionId));
            // console.log(JSON.stringify(Network.getResponseBody(interceptionId)));
          }
        }

        // Every intercepted request is released, image or not.
        Network.continueInterceptedRequest({ interceptionId });
      };

      const onLoadingFinished = (event) => {
        console.log(event.requestId);
        console.log(event.timestamp);
        console.log(event.encodedDataLength);
      };

      const onPageLoaded = async () => {
        // Tear down the protocol connection first, then the browser process.
        protocol.close();
        chrome.kill();
      };

      Network.requestIntercepted(onRequestIntercepted);
      Network.loadingFinished(onLoadingFinished);

      // Navigation is started without awaiting it; the load-event handler
      // registered right after performs the shutdown.
      Page.navigate({ url: 'https://www.yahoo.com/' });
      Page.loadEventFired(onPageLoaded);
    })();
    

1 个答案:

答案 0 :(得分:0)

这应该能帮你完成 90% 的工作。它会获取每个图像请求的响应正文。你仍然需要自己进行 base64 解码、检查大小并保存等……

const CDP = require('chrome-remote-interface');

// Intended minimum body length for an image to be worth keeping.
// NOTE(review): no code in this snippet reads it yet; the unit (characters
// of the returned body vs. decoded bytes) is not established here — confirm
// before using it as a filter.
const sizeThreshold = 1024;

/**
 * Connects to a Chrome instance through chrome-remote-interface, navigates to
 * a page and collects the response bodies of every response whose URL
 * contains ".jpg" or ".png".
 *
 * Image responses are remembered when `Network.responseReceived` fires, but
 * their bodies are only requested once `Network.loadingFinished` reports the
 * same requestId. Asking for the body straight from `responseReceived` can
 * reject because the resource data is not available yet.
 *
 * All errors are logged rather than thrown; the protocol client is always
 * closed before the returned promise settles.
 *
 * @returns {Promise<void>}
 */
async function run() {
  // Declared outside the try block so the finally clause can see it even
  // when the connection attempt itself fails.
  let client;
  try {
    client = await CDP();
    const { Network, Page } = client;

    // enable events
    await Promise.all([Network.enable(), Page.enable()]);

    // commands
    const _url = "https://google.co.za";
    const _pics = [];
    // requestId -> url for image responses whose body is still loading.
    const pendingImages = new Map();

    Network.responseReceived(({ requestId, response }) => {
      const url = response ? response.url : null;
      // Guard against a missing response/url instead of throwing on null.
      if (typeof url !== 'string') {
        return;
      }
      if (url.includes('.jpg') || url.includes('.png')) {
        pendingImages.set(requestId, url);
      }
    });

    Network.loadingFinished(async ({ requestId }) => {
      const url = pendingImages.get(requestId);
      if (url === undefined) {
        return; // not an image we are tracking
      }
      pendingImages.delete(requestId);
      try {
        const { body, base64Encoded } = await Network.getResponseBody({ requestId });
        _pics.push({ url, body, base64Encoded });
        console.log(url, body, base64Encoded);
      } catch (err) {
        // A single unreadable body (e.g. the client closed meanwhile) must
        // not surface as an unhandled rejection.
        console.error(`Could not read response body for ${url}:`, err);
      }
    });

    // Forget images whose load failed so the map does not keep stale entries.
    Network.loadingFailed(({ requestId }) => {
      pendingImages.delete(requestId);
    });

    await Page.navigate({ url: _url });
    // Fixed grace period for image requests to complete after navigation.
    await sleep(5000);

    // TODO: process _pics - base64Encoded, check body.length > sizeThreshold, save etc...

  } catch (err) {
    if (err && err.message === "No inspectable targets") {
      console.error("Either chrome isn't running or you already have another app connected to chrome - e.g. `chrome-remote-interface inspect`");
    } else {
      console.error(err);
    }
  } finally {
    if (client) {
      await client.close();
    }
  }
}

/**
 * Returns a promise that resolves (with no value) after the given delay.
 *
 * @param {number} [miliseconds=1000] Delay in milliseconds; a value that
 *   loosely equals 0 yields an already-resolved promise without a timer.
 * @returns {Promise<void>}
 */
function sleep(miliseconds = 1000) {
  // Loose comparison kept as-is so values that coerce to 0 also skip the timer.
  const skipTimer = miliseconds == 0;
  return skipTimer
    ? Promise.resolve()
    : new Promise((done) => {
        setTimeout(() => done(), miliseconds);
      });
}

run();