Puppeteer 不关闭浏览器

时间:2018-12-27 03:43:53

标签: puppeteer

我正在express / node / ubuntu上运行puppeteer,如下所示:

var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();

/* GET home page. */
// Question code, kept as posted: for every request it launches a headless browser,
// loads the address given in the `url` query parameter and sends back the HTML of
// the page body.
router.get('/', function(req, res, next) {
    // The async IIFE's promise is neither awaited nor given a catch handler.
    (async () => {
        // Assigned without a declaration and not read again inside this handler.
        headless = true;
        // A new browser is launched on each request.
        const browser = await puppeteer.launch({headless: true, args:['--no-sandbox']});
        const page = await browser.newPage();
        // Also assigned without a declaration; the value comes straight from the query string.
        url = req.query.url;
        await page.goto(url);
        let bodyHTML = await page.evaluate(() => document.body.innerHTML);
        res.send(bodyHTML)
        // There is no try/finally: if any await above rejects, this line is never reached.
        await browser.close();
    })();
});

多次运行此脚本会留下数百个僵尸进程:

$ pgrep chrome | wc -l
133

这会阻塞服务器,

我该如何解决?

通过Express JS脚本运行kill可以解决问题吗?

除了 Puppeteer 和无头 Chrome 之外,还有没有更好的方法来获得相同的结果?

7 个答案:

答案 0 :(得分:2)

像这样在try-catch中包装代码,看看是否有帮助

// Fragment meant to run inside the async route handler (it uses `req` and `res`).
// Launch outside the try block: if launch itself fails there is no browser to close.
const browser = await puppeteer.launch({headless: true, args:['--no-sandbox']});
try {
  const page = await browser.newPage();
  const url = req.query.url;
  await page.goto(url);
  const bodyHTML = await page.evaluate(() => document.body.innerHTML);
  res.send(bodyHTML);
} catch (error) {
  console.log(error);
  // Answer the request on failure too, so the client is not left waiting.
  if (!res.headersSent) res.sendStatus(500);
} finally {
  // finally runs on both the success and the error path, so this single
  // close() replaces the extra calls in the try and catch blocks.
  await browser.close();
}

答案 1 :(得分:2)

根据我的经验,在调用close之后,浏览器关闭过程可能需要一些时间。无论如何,您可以检查浏览器进程属性,以检查它是否仍未关闭并强制杀死它。

// Send SIGINT to the browser's child process when there is one.
const browserProcess = browser ? browser.process() : null;
if (browserProcess != null) {
    browserProcess.kill('SIGINT');
}

我还在下面贴出了我的 Puppeteer 资源管理器的完整代码。请留意其中的 bw.on('disconnected', async () => { 部分。

const puppeteer = require('puppeteer-extra')
const randomUseragent = require('random-useragent');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')

// Fixed desktop Chrome user-agent string; createPage() uses it when
// random-useragent does not return a value.
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36';
// Register the stealth plugin on the puppeteer-extra instance.
puppeteer.use(StealthPlugin())

/**
 * Owns one Puppeteer browser and hands out configured pages from it.
 *
 * When the browser disconnects without release() having been called, it is
 * relaunched, up to four times per init(); the next disconnect after that
 * makes the handler reject with an Error.
 *
 * @param {boolean} loadImages - when falsy, stylesheet, font and image
 *     requests of created pages are aborted.
 */
function ResourceManager(loadImages) {
    let browser = null;
    let retries = 0;
    let isReleased = false;

    /** Launches a fresh browser and resets the crash counter. */
    this.init = async () => {
        isReleased = false;
        retries = 0;
        browser = await runBrowser();
    };

    /** Marks the manager as released (disables auto-restart) and closes the browser. */
    this.release = async () => {
        isReleased = true;
        if (browser) await browser.close();
    };

    /**
     * Opens a new page on the managed browser (launching it first if needed)
     * and navigates it to `url`.
     * @param {string} url - address to load.
     * @returns {Promise<object>} the Puppeteer page after navigation.
     */
    this.createPage = async (url) => {
        if (!browser) browser = await runBrowser();
        return await createPage(browser, url);
    };

    // Launches the browser and installs the crash handler on it.
    async function runBrowser () {
        const bw = await puppeteer.launch({
            headless: true,
            devtools: false,
            ignoreHTTPSErrors: true,
            slowMo: 0,
            args: ['--disable-gpu','--no-sandbox','--no-zygote','--disable-setuid-sandbox','--disable-accelerated-2d-canvas','--disable-dev-shm-usage', "--proxy-server='direct://'", "--proxy-bypass-list=*"]
        });

        bw.on('disconnected', async () => {
            // A disconnect caused by release() is expected; do not restart.
            if (isReleased) return;
            console.log("BROWSER CRASH");
            if (retries <= 3) {
                retries += 1;
                // Kill the process of the instance that actually disconnected.
                if (bw.process() != null) bw.process().kill('SIGINT');
                // Relaunch directly instead of calling init(): init() resets
                // `retries` to 0, which would make the limit unreachable.
                browser = await runBrowser();
            } else {
                throw new Error("===================== BROWSER crashed more than 3 times");
            }
        });

        return bw;
    }

    // Creates a page with a randomised viewport and user agent, installs the
    // navigator/window overrides below, then navigates to `url`.
    async function createPage (browser,url) {
        const userAgent = randomUseragent.getRandom();
        const UA = userAgent || USER_AGENT;
        const page = await browser.newPage();
        await page.setViewport({
            width: 1920 + Math.floor(Math.random() * 100),
            height: 3000 + Math.floor(Math.random() * 100),
            deviceScaleFactor: 1,
            hasTouch: false,
            isLandscape: false,
            isMobile: false,
        });
        await page.setUserAgent(UA);
        await page.setJavaScriptEnabled(true);
        await page.setDefaultNavigationTimeout(0);
        if (!loadImages) {
            await page.setRequestInterception(true);
            page.on('request', (req) => {
                const type = req.resourceType();
                if (type === 'stylesheet' || type === 'font' || type === 'image') {
                    req.abort();
                } else {
                    req.continue();
                }
            });
        }

        await page.evaluateOnNewDocument(() => {
            //pass webdriver check
            Object.defineProperty(navigator, 'webdriver', {
                get: () => false,
            });
        });

        await page.evaluateOnNewDocument(() => {
            //pass chrome check
            window.chrome = {
                runtime: {},
                // etc.
            };
        });

        await page.evaluateOnNewDocument(() => {
            //pass permissions check
            const originalQuery = window.navigator.permissions.query;
            return window.navigator.permissions.query = (parameters) => (
                parameters.name === 'notifications' ?
                    Promise.resolve({ state: Notification.permission }) :
                    originalQuery(parameters)
            );
        });

        await page.evaluateOnNewDocument(() => {
            // Overwrite the `plugins` property to use a custom getter.
            Object.defineProperty(navigator, 'plugins', {
                // This just needs to have `length > 0` for the current test,
                // but we could mock the plugins too if necessary.
                get: () => [1, 2, 3, 4, 5],
            });
        });

        await page.evaluateOnNewDocument(() => {
            // Overwrite the `languages` property to use a custom getter.
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en'],
            });
        });

        await page.goto(url, { waitUntil: 'networkidle2',timeout: 0 } );
        return page;
    }
}

// Export the ResourceManager constructor as a named property of the module.
module.exports = {ResourceManager}

答案 2 :(得分:1)

啊!这是一个简单的疏忽。您是在发送响应之后才尝试关闭浏览器的。发送响应后,执行流程停止,并且您的 await browser.close() 永不执行,从而留下僵尸进程。

使用shell.js似乎是解决此问题的一种简便方法。

您可以简单地交换 res.send(bodyHTML) 和 await browser.close() 这两行。

但是,更好的做法是使用try..catch..finally。原因是您希望关闭浏览器,而不管流程是否正常或抛出错误。 而且与其他代码段不同,您不必尝试在catch块和finally块中关闭浏览器。不论是否引发错误,始终执行finally块。

所以,您的代码应该看起来像

const puppeteer = require('puppeteer');
const express = require('express');

const router = express.Router();

/* GET home page: load the page given by ?url= in headless Chromium and send back its body HTML. */
router.get('/', function(req, res, next) {
  (async () => {
    // Declared outside the try block so the finally block can see it; a
    // `const` inside `try` is out of scope in `finally`.
    let browser;
    try {
      browser = await puppeteer.launch({
        headless: true,
        args: ['--no-sandbox'],
      });
      const page = await browser.newPage();
      const url = req.query.url;
      await page.goto(url);
      const bodyHTML = await page.evaluate(() => document.body.innerHTML);
      res.send(bodyHTML);
    } catch (e) {
      console.log(e);
      // Hand the error to Express so the request still gets a response.
      next(e);
    } finally {
      // `browser` is still undefined when launch() itself failed.
      if (browser) await browser.close();
    }
  })();
});

希望这会有所帮助!

答案 3 :(得分:1)

我遇到了同样的问题,尽管您的shelljs解决方案确实起作用了,但它杀死了所有chrome进程,这可能会中断仍在处理请求的chrome进程。这是一个应该可行的更好的解决方案。

var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();

// Renders the page given by ?url= and responds with its body HTML. The browser
// is closed before the response is sent, including when navigation fails.
router.get('/', function (req, res, next) {
    (async () => {
        const browser = await puppeteer.launch({ headless: true });
        let bodyHTML;
        try {
            const page = await browser.newPage();
            const url = req.query.url;
            await page.goto(url);
            bodyHTML = await page.evaluate(() => document.body.innerHTML);
        } finally {
            // Close this request's own browser even if goto/evaluate threw.
            await browser.close();
        }
        res.send(bodyHTML);
    })().catch(next); // forward failures to Express instead of leaving the rejection unhandled
});

答案 4 :(得分:0)

我用https://www.npmjs.com/package/shelljs

解决了
// Runs `pkill chrome` through shelljs.
const shell = require('shelljs');
shell.exec('pkill chrome');

答案 5 :(得分:0)

尝试在发送响应之前关闭浏览器

var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();

// Closes the browser before sending the response, and also when loading the page fails.
router.get('/', function(req, res, next) {
    (async () => {
        const browser = await puppeteer.launch({headless: true});
        let bodyHTML;
        try {
            const page = await browser.newPage();
            const url = req.query.url;
            await page.goto(url);
            bodyHTML = await page.evaluate(() => document.body.innerHTML);
        } finally {
            // Reached on success and on error, so the browser never outlives the request.
            await browser.close();
        }
        res.send(bodyHTML);
    })().catch(next); // let Express answer the request when anything above rejects
});

答案 6 :(得分:0)

我使用以下基本设置来运行 Puppeteer:

const puppeteer = require("puppeteer");

// Kept outside the async function so the .finally() handler can reach it.
let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();

  /* use the page */

})()
  .catch(err => console.error(err))
  .finally(async () => {
    // If launch() itself failed, `browser` was never assigned and there is nothing to close.
    if (browser) await browser.close();
  })
;

这里,finally 块保证浏览器将正确关闭,无论是否抛出错误。错误会被记录下来(如果需要)。我喜欢将 .catch 和 .finally 作为链式调用,因为这样 Puppeteer 主体代码更扁平,但下面的写法可以达到同样的效果:

const puppeteer = require("puppeteer");

(async () => {
  // Declared before the try block so it is visible in finally.
  let browser;

  try {
    browser = await puppeteer.launch();
    const [page] = await browser.pages();

    /* use the page */
  }
  catch (err) {
    console.error(err);
  }
  finally {
    // `browser` stays undefined when launch() failed; skip close() in that case.
    if (browser) await browser.close();
  }
})();

没有理由调用 newPage,因为 Puppeteer 从打开的页面开始。


对于 Express,您只需将上面的整个代码(包括 let browser;,但不包括 require("puppeteer"))放入您的路由中即可使用,不过您可能想要使用 async middleware error handler(异步中间件错误处理器)。

你问:

> 除了 puppeteer 和 headless chrome 之外,还有更好的方法来获得相同的结果吗?

这取决于您在做什么以及您所说的“更好”是什么意思。如果您的目标是获取 document.body.innerHTML 并且您感兴趣的页面内容已烘焙到静态 HTML 中,则您可以完全抛开 Puppeteer,只需发出请求获取资源,然后使用 Cheerio 提取所需信息。

另一个考虑是您可能不需要为每个请求加载和关闭整个浏览器。如果您可以为每个请求使用一个新页面,请考虑以下策略:

const express = require("express");
const puppeteer = require("puppeteer");

// Wraps a route handler so that any failure reaches Express through next().
// Running `fn` inside an async function turns a synchronous throw into a
// rejection as well, so both kinds of failure reach .catch(next).
// Returns the resulting promise.
const asyncHandler = fn => (req, res, next) =>
  (async () => fn(req, res, next))().catch(next)
;
// One browser is launched at startup and shared; each request opens its own page.
const browserReady = puppeteer.launch({
  args: ["--no-sandbox", "--disable-setuid-sandbox"]
});

const app = express();
app
  .set("port", process.env.PORT || 5000)
  .get("/", asyncHandler(async (req, res) => {
    // The URL comes from the client. Parse it and accept only http(s) so the
    // browser cannot be pointed at file:// or other local schemes.
    let target;
    try {
      target = new URL(req.query.url || "http://www.example.com");
    }
    catch (err) {
      return res.status(400).send(err.message);
    }
    if (target.protocol !== "http:" && target.protocol !== "https:") {
      return res.status(400).send("Only http and https URLs are supported");
    }

    const browser = await browserReady;
    const page = await browser.newPage();

    try {
      await page.goto(target.href);
      return res.send(await page.content());
    }
    catch (err) {
      return res.status(400).send(err.message);
    }
    finally {
      // Close only this request's page; the shared browser stays open.
      await page.close();
    }
  }))
  .use((err, req, res, next) => res.sendStatus(500))
  .listen(app.get("port"), () =>
    console.log("listening on port", app.get("port"))
  )
;

这仍然是一项繁重的工作,而且虽然 Puppeteer 把 Chromium 作为子进程运行,您可能仍需要考虑将此作业卸载到任务队列(例如 bull)并在后台运行。

另见: