排队异步任务

时间:2018-09-09 19:07:53

标签: node.js puppeteer

我正在尝试编写一个脚本:它接收一个URL列表,然后访问其中的每个网站并截取屏幕截图。

我已经用puppeteer实现了这个功能。但我遇到的问题是:比如列表中有50个URL时,它会尝试同时为所有URL启动puppeteer会话,这意味着在网站加载完成并可以截取屏幕截图之前,需要花费大量时间。

我发现我可以一次成功运行10个,所以我想建立一个排队系统来完成此操作。

// Drains every record the parser currently has buffered and starts one
// screenshot job per record. The jobs are deliberately NOT awaited, so all of
// them run at the same time (one browser instance per record).
parser.on('readable', function () {
  let record;
  while ((record = parser.read()) !== null) {
      counter += 1;
      console.log(record.URL);

      // Snapshot the per-record values now: `record` and `counter` will have
      // moved on by the time the asynchronous job below resumes.
      const url = record.URL;
      const counter1 = counter;

      (async () => {
          const browser = await puppeteer.launch({ defaultViewport: { width: 1024, height: 768 } });
          try {
              const page = await browser.newPage();
              await page.goto(url);
              const title = await page.title();
              // Capture group 1 is the host name without scheme, credentials or "www.".
              const domainRegex = /^(?:https?:\/\/)?(?:[^@\n]+@)?(?:www\.)?([^:\/\n?]+)/img;
              const match = domainRegex.exec(url);

              const width = 1024;
              const height = 1000;
              await page.screenshot({
                  path: "Screenshots/" + counter1 + ". " + match[1] + "- " + title.replace(/[\W_]+/g, "") + ".jpg",
                  clip: { x: 0, y: 0, width: width, height: height },
              });
          } finally {
              // Always release the browser, even when navigation or the screenshot fails.
              await browser.close();
          }
      })().catch((err) => {
          // The job is fire-and-forget, so report failures here instead of
          // leaving an unhandled promise rejection.
          console.error("Screenshot failed for " + url + ":", err);
      });
  }
});

4 个答案:

答案 0 :(得分:2)

如果要串行运行它们,可以将其转换为异步函数并等待。这样,它将一个接一个地运行。

// let's separate it for readability
/**
 * Opens `record.URL` in a fresh browser and saves a 1024x1000 screenshot as
 * "Screenshots/<counter>. <host>- <sanitised page title>.jpg".
 *
 * @param {{URL: string}} record - Record whose `URL` property is visited.
 * @param {number} counter - Sequence number used as the file-name prefix.
 * @returns {Promise<void>} Resolves once the screenshot is written and the
 *     browser is closed; rejects if any puppeteer step fails.
 */
async function getRecord(record, counter) {
    const url = record.URL;
    const browser = await puppeteer.launch({
        defaultViewport: {
            width: 1024,
            height: 768
        }
    });

    try {
        const page = await browser.newPage();
        await page.goto(url);
        const title = await page.title();
        // Capture group 1 is the host name without scheme, credentials or "www.".
        const domainRegex = /^(?:https?:\/\/)?(?:[^@\n]+@)?(?:www\.)?([^:\/\n?]+)/img;
        const match = domainRegex.exec(url);

        const width = 1024;
        const height = 1000;
        await page.screenshot({
            // Non-word characters are stripped from the title to keep the file name safe.
            path: "Screenshots/" + counter + ". " + match[1] + "- " + title.replace(/[\W_]+/g, "") + ".jpg",
            clip: {
                x: 0,
                y: 0,
                width: width,
                height: height
            }
        });
    } finally {
        // Close the browser on failure too, so a bad URL does not leak a process.
        await browser.close();
    }
}

// All screenshot jobs are appended to this single promise chain, which is what
// guarantees they run strictly one after another. Awaiting inside an async
// 'readable' handler is not enough: the event can fire again while the handler
// is suspended, starting a second loop that runs alongside the first.
let pendingWork = Promise.resolve();

parser.on('readable', function () {
    let record;
    // Drain everything currently buffered; the records wait in the chain, not in the stream.
    while ((record = parser.read()) !== null) {
        counter += 1;
        const current = record;
        const number = counter;
        pendingWork = pendingWork
            .then(() => {
                console.log(current.URL);
                return getRecord(current, number);
            })
            .catch((err) => {
                // Report the failure and keep the chain alive for the remaining records.
                console.error("Failed to process " + current.URL + ":", err);
            });
    }
});

还有诸如Promise.map、for..of之类的其他方式,但我们暂时保持简单。

答案 1 :(得分:2)

以下代码最初将启动10个会话。每个会话结束后,它将使下一个记录出队并启动另一个会话,直到没有剩余的记录为止。这样可以确保最多同时运行10个会话。

/**
 * Takes a screenshot for every record read from `source`, keeping at most
 * `maxNumberOfSessions` browser sessions running at any moment.
 *
 * The counters live outside the event handlers on purpose: 'readable' may be
 * emitted many times, and each emission must share the same record numbering
 * and the same session limit.
 *
 * @param {object} source - Object-mode readable stream yielding `{URL}` records.
 * @param {number} [maxNumberOfSessions=10] - Upper bound on concurrent sessions.
 */
function processRecordsWithLimit(source, maxNumberOfSessions = 10) {
    let counter = 0;
    let activeSessions = 0;
    let inputEnded = false;

    source.on('readable', startSessions);
    source.on('end', () => {
        inputEnded = true;
        reportWhenDone();
    });

    // Fills every free session slot with the next buffered record, if any.
    function startSessions() {
        while (activeSessions < maxNumberOfSessions) {
            const record = source.read();
            if (record === null) return; // nothing buffered right now; wait for 'readable'

            activeSessions += 1;
            processRecord(record)
                .catch((err) => {
                    // One failing URL must not stop the remaining records.
                    console.error("Failed to process " + record.URL + ":", err);
                })
                .then(() => {
                    activeSessions -= 1;
                    startSessions(); // a slot just became free
                    reportWhenDone();
                });
        }
    }

    // Logs completion once the input has ended and the last session has finished.
    function reportWhenDone() {
        if (inputEnded && activeSessions === 0) {
            console.log("All records have been processed.");
        }
    }

    // Opens the record's URL in a fresh browser and stores a numbered screenshot.
    async function processRecord(record) {
        const number = ++counter;
        console.log("Processing record #" + number + ": " + record.URL);

        const browser = await puppeteer.launch({defaultViewport: {width: 1024, height: 768}});
        try {
            const page = await browser.newPage();
            await page.goto(record.URL);
            const title = await page.title();
            // Capture group 1 is the host name without scheme, credentials or "www.".
            const domainRegex = /^(?:https?:\/\/)?(?:[^@\n]+@)?(?:www\.)?([^:\/\n?]+)/img;
            const match = domainRegex.exec(record.URL);

            const width = 1024;
            const height = 1000;
            await page.screenshot({path: "Screenshots/" + number + ". " + match[1] + "- " + title.replace(/[\W_]+/g, "") + ".jpg", clip: {x: 0, y: 0, width, height}});
        } finally {
            // Close the browser on failure too, so the slot's process is not leaked.
            await browser.close();
        }
    }
}

processRecordsWithLimit(parser);

答案 2 :(得分:1)

如果您要依次运行一组Promise,则可以使用Bluebird软件包中的Promise.mapSeries。我知道这意味着要添加一个额外的程序包,但这很简单,不需要您自己构建排队系统。

http://bluebirdjs.com/docs/api/promise.mapseries.html

答案 3 :(得分:1)

您可能想看看puppeteer-cluster(免责声明:我是作者)。

您可以这样做:

// Turns a "res=[key=value,key=value]" string into JSON text and parses it.
// Assumes keys and values contain none of the characters [ ] = , "
const input = "res=[xyz=name,abc=address]";
const str = input.split("res=")[1]
  .replace("[",'{"')      // opening bracket -> start of object and first key
  .replace("]",'"}')      // closing bracket -> end of last value and object
  .replace(/=/g,'":"')    // key/value separator
  .replace(/,/g,'","');   // pair separator

const res = JSON.parse(str);
console.log(str,"\n",res);

这将并行运行10个浏览器实例,并且还会处理浏览器崩溃和错误。