我正在express / node / ubuntu上运行puppeteer,如下所示:
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();
/* GET home page. */
// Route handler from the question: for every GET / it launches a fresh headless
// Chrome, loads the page named by the `url` query parameter and responds with
// the HTML of that page's <body>.
router.get('/', function(req, res, next) {
(async () => {
// Assigned without a declaration and not read again in this snippet.
headless = true;
const browser = await puppeteer.launch({headless: true, args:['--no-sandbox']});
const page = await browser.newPage();
// Also assigned without a declaration.
url = req.query.url;
await page.goto(url);
let bodyHTML = await page.evaluate(() => document.body.innerHTML);
res.send(bodyHTML)
// Only reached when every step above succeeded: if newPage(), goto() or
// evaluate() rejects, this close() is skipped and the launched browser is
// never shut down by this handler.
await browser.close();
})();
});
多次运行此脚本会留下数百个僵尸:
$ pgrep chrome | wc -l
133
这会拖垮服务器。
我该如何解决?
通过 Express JS 脚本运行 kill
可以解决问题吗?
除了 puppeteer 和无头 Chrome 之外,还有没有更好的方法来获得相同的结果?
答案 0 :(得分:2)
像这样在try-catch中包装代码,看看是否有帮助
// Fetch the rendered <body> of the requested page and shut the browser down
// exactly once afterwards, whether or not the scrape succeeded.
const browser = await puppeteer.launch({headless: true, args: ['--no-sandbox']});
try {
  const page = await browser.newPage();
  const url = req.query.url;
  await page.goto(url);
  const bodyHTML = await page.evaluate(() => document.body.innerHTML);
  res.send(bodyHTML);
} catch (error) {
  console.log(error);
  // Answer the request instead of leaving the client waiting forever.
  if (!res.headersSent) {
    res.sendStatus(500);
  }
} finally {
  // finally runs on both the success and the failure path, so this single
  // close() is sufficient; closing in try and catch as well only repeats it.
  await browser.close();
}
答案 1 :(得分:2)
根据我的经验,在调用close之后,浏览器关闭过程可能需要一些时间。无论如何,您可以检查浏览器进程属性,以检查它是否仍未关闭并强制杀死它。
// Fallback for a browser that is still around after close() was requested:
// if the browser object still reports a child process, send it SIGINT directly.
if (browser && browser.process() != null) browser.process().kill('SIGINT');
我还在下面贴出了我的 puppeteer 资源管理器的完整代码。请留意其中的 bw.on('disconnected', async () => { 这一段。
const puppeteer = require('puppeteer-extra')
const randomUseragent = require('random-useragent');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
const USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36';
puppeteer.use(StealthPlugin())
/**
 * Owns a single headless browser and hands out pre-configured pages from it.
 * The browser is relaunched automatically when it disconnects unexpectedly,
 * up to a bounded number of times.
 *
 * Usage: `const rm = new ResourceManager(false); await rm.init();`
 *
 * @param {boolean} loadImages - when falsy, stylesheet, font and image
 *   requests are aborted on every page created by this manager.
 */
function ResourceManager(loadImages) {
  let browser = null;
  let retries = 0;
  let isReleased = false;

  /** Launches the browser and resets the crash bookkeeping. */
  this.init = async () => {
    isReleased = false;
    retries = 0;
    browser = await runBrowser();
  };

  /**
   * Closes the browser. `isReleased` is set first so that the resulting
   * 'disconnected' event is not mistaken for a crash.
   */
  this.release = async () => {
    isReleased = true;
    if (browser) await browser.close();
  };

  /**
   * Opens a configured page and navigates it to `url`, launching the browser
   * first if none has been started yet.
   *
   * @param {string} url - address to navigate to.
   * @returns {Promise<object>} the page, after navigation has finished.
   */
  this.createPage = async (url) => {
    if (!browser) browser = await runBrowser();
    return await createPage(browser, url);
  };

  /**
   * Launches a browser and installs the crash-recovery handler on it.
   *
   * @returns {Promise<object>} the launched browser.
   */
  async function runBrowser() {
    const bw = await puppeteer.launch({
      headless: true,
      devtools: false,
      ignoreHTTPSErrors: true,
      slowMo: 0,
      args: ['--disable-gpu', '--no-sandbox', '--no-zygote', '--disable-setuid-sandbox', '--disable-accelerated-2d-canvas', '--disable-dev-shm-usage', "--proxy-server='direct://'", "--proxy-bypass-list=*"]
    });

    bw.on('disconnected', async () => {
      // A disconnect caused by release() is expected; nothing to recover.
      if (isReleased) return;
      console.log("BROWSER CRASH");
      if (retries <= 3) {
        retries += 1;
        // Make sure the old process is really gone before starting a new one.
        if (browser && browser.process() != null) browser.process().kill('SIGINT');
        // Relaunch directly rather than through init(): init() resets
        // `retries` to 0, which would make the limit below unreachable and
        // turn a crashing browser into an endless relaunch loop.
        browser = await runBrowser();
      } else {
        // Throw a real Error (not a bare string) so the failure carries a stack trace.
        throw new Error("===================== BROWSER crashed more than 3 times");
      }
    });

    return bw;
  }

  /**
   * Creates a page on `browser`, applies viewport, user agent, optional
   * request filtering and several navigator overrides, then navigates to `url`.
   *
   * @param {object} browser - browser to open the page in.
   * @param {string} url - address to navigate to.
   * @returns {Promise<object>} the page, after navigation has finished.
   */
  async function createPage(browser, url) {
    // Fall back to the fixed user agent when no random one is returned.
    const userAgent = randomUseragent.getRandom();
    const UA = userAgent || USER_AGENT;
    const page = await browser.newPage();

    // The viewport size is jittered by up to 99px per page.
    await page.setViewport({
      width: 1920 + Math.floor(Math.random() * 100),
      height: 3000 + Math.floor(Math.random() * 100),
      deviceScaleFactor: 1,
      hasTouch: false,
      isLandscape: false,
      isMobile: false,
    });
    await page.setUserAgent(UA);
    await page.setJavaScriptEnabled(true);
    // NOTE(review): 0 presumably disables the navigation timeout - confirm against the Puppeteer docs.
    await page.setDefaultNavigationTimeout(0);

    if (!loadImages) {
      const blockedTypes = ['stylesheet', 'font', 'image'];
      await page.setRequestInterception(true);
      page.on('request', (req) => {
        if (blockedTypes.includes(req.resourceType())) {
          req.abort();
        } else {
          req.continue();
        }
      });
    }

    await page.evaluateOnNewDocument(() => {
      // pass webdriver check
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
      });
    });

    await page.evaluateOnNewDocument(() => {
      // pass chrome check
      window.chrome = {
        runtime: {},
        // etc.
      };
    });

    await page.evaluateOnNewDocument(() => {
      // pass permissions check: answer 'notifications' queries from
      // Notification.permission and delegate everything else.
      const originalQuery = window.navigator.permissions.query;
      return window.navigator.permissions.query = (parameters) => (
        parameters.name === 'notifications' ?
          Promise.resolve({ state: Notification.permission }) :
          originalQuery(parameters)
      );
    });

    await page.evaluateOnNewDocument(() => {
      // Overwrite the `plugins` property to use a custom getter.
      Object.defineProperty(navigator, 'plugins', {
        // This just needs to have `length > 0` for the current test,
        // but we could mock the plugins too if necessary.
        get: () => [1, 2, 3, 4, 5],
      });
    });

    await page.evaluateOnNewDocument(() => {
      // Overwrite the `languages` property to use a custom getter.
      Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en'],
      });
    });

    await page.goto(url, { waitUntil: 'networkidle2', timeout: 0 });
    return page;
  }
}
module.exports = {ResourceManager}
答案 2 :(得分:1)
啊!这是一个简单的疏忽。您正在尝试在发送响应后 关闭浏览器。发送响应后,执行流程停止,并且您的await browser.close()
永不执行,从而使您陷入僵尸。
使用shell.js
似乎是解决此问题的一种简便方法。
您可以简单地交换两行res.send(bodyHTML)
和await browser.close()
。
但是,更好的做法是使用try..catch..finally
。原因是您希望关闭浏览器,而不管流程是否正常或抛出错误。
而且与其他代码段不同,您不必尝试在catch
块和finally
块中关闭浏览器。不论是否引发错误,始终执行finally
块。
所以,您的代码应该看起来像
const puppeteer = require('puppeteer');
const express = require('express');
const router = express.Router();
/*
 * GET home page: renders the page named by the `url` query parameter in
 * headless Chrome and responds with the HTML of its <body>.
 */
router.get('/', function(req, res, next) {
  (async () => {
    // Declared outside the try block: a const declared inside `try` is not in
    // scope in `finally`, so closing it there would throw a ReferenceError.
    let browser;
    try {
      browser = await puppeteer.launch({
        headless: true,
        args: ['--no-sandbox'],
      });
      const page = await browser.newPage();
      const url = req.query.url;
      await page.goto(url);
      const bodyHTML = await page.evaluate(() => document.body.innerHTML);
      res.send(bodyHTML);
    } catch (e) {
      console.log(e);
      // Answer the request instead of leaving the client waiting forever.
      if (!res.headersSent) {
        res.sendStatus(500);
      }
    } finally {
      // Runs on success and on failure alike. `browser` is still undefined
      // when launch() itself failed, in which case there is nothing to close.
      if (browser) {
        await browser.close();
      }
    }
  })();
});
希望这会有所帮助!
答案 3 :(得分:1)
我遇到了同样的问题,尽管您的shelljs解决方案确实起作用了,但它杀死了所有chrome进程,这可能会中断仍在处理请求的chrome进程。这是一个应该可行的更好的解决方案。
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();
// GET /: renders the page named by the `url` query parameter in headless
// Chrome, closes the browser, and only then responds with the <body> HTML.
router.get('/', function (req, res, next) {
  (async () => {
    const browser = await puppeteer.launch({ headless: true });
    let bodyHTML;
    try {
      const page = await browser.newPage();
      const url = req.query.url;
      await page.goto(url);
      bodyHTML = await page.evaluate(() => document.body.innerHTML);
    } finally {
      // Close this request's browser even when navigation or evaluation
      // fails; otherwise every failed request leaves a Chrome process behind.
      await browser.close();
    }
    res.send(bodyHTML);
  })().catch(next); // hand failures to Express instead of leaving an unhandled rejection
});
答案 4 :(得分:0)
我用 shelljs(https://www.npmjs.com/package/shelljs)解决了这个问题:
var shell = require('shelljs');
shell.exec('pkill chrome')
答案 5 :(得分:0)
尝试在发送响应之前关闭浏览器
var puppeteer = require('puppeteer');
var express = require('express');
var router = express.Router();
// GET /: renders the page named by the `url` query parameter in headless
// Chrome and closes the browser before the response is sent.
router.get('/', function(req, res, next) {
  (async () => {
    const browser = await puppeteer.launch({headless: true});
    let bodyHTML;
    try {
      const page = await browser.newPage();
      const url = req.query.url;
      await page.goto(url);
      bodyHTML = await page.evaluate(() => document.body.innerHTML);
    } finally {
      // Also reached when goto()/evaluate() rejects, so a failed request
      // does not leave its Chrome process running.
      await browser.close();
    }
    res.send(bodyHTML);
  })().catch(next); // forward failures to Express error handling
});
答案 6 :(得分:0)
我使用以下基本设置来运行 Puppeteer:
const puppeteer = require("puppeteer");
// Basic Puppeteer skeleton: errors are logged by .catch, and .finally closes
// the browser on both the success and the failure path.
let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  /* use the page */
})()
  .catch(err => console.error(err))
  .finally(async () => {
    // `browser` is still undefined when launch() itself failed; calling
    // close() on it would throw and surface as an unhandled rejection.
    if (browser) {
      await browser.close();
    }
  });
这里,finally
块保证浏览器将正确关闭,无论是否抛出错误。记录错误(如果需要)。我喜欢将 .catch
和 .finally
作为链式调用,因为主线 Puppeteer 代码更平坦,但这完成了同样的事情:
const puppeteer = require("puppeteer");
// Same skeleton written with try/catch/finally inside the async function.
(async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const [page] = await browser.pages();
    /* use the page */
  }
  catch (err) {
    console.error(err);
  }
  finally {
    // `browser` is still undefined when launch() itself failed; calling
    // close() on it would throw out of the finally block.
    if (browser) {
      await browser.close();
    }
  }
})();
没有理由调用 newPage
,因为 Puppeteer 从打开的页面开始。
对于 Express,您只需将上面的整个代码(包括 let browser;
和不包括 require("puppeteer")
)放入您的路由处理函数中即可,不过您可能还想使用异步中间件错误处理程序(async middleware error handler)。
你问:
> 除了 puppeteer 和 headless chrome 之外,还有更好的方法来获得相同的结果吗?
这取决于您在做什么以及您所说的“更好”是什么意思。如果您的目标是获取 document.body.innerHTML
并且您感兴趣的页面内容已直接包含在静态 HTML 中,则您可以完全弃用 Puppeteer,只需发出一个获取该资源的请求,然后使用 Cheerio 提取所需信息。
另一个考虑是您可能不需要为每个请求加载和关闭整个浏览器。如果您可以为每个请求使用一个新页面,请考虑以下策略:
const express = require("express");
const puppeteer = require("puppeteer");
/**
 * Wraps a route handler so that a rejected promise returned by it is passed
 * to Express's `next` callback.
 *
 * @param {Function} fn - handler called as fn(req, res, next); may be async.
 * @returns {Function} middleware with the usual (req, res, next) signature.
 */
const asyncHandler = (fn) => {
  return (req, res, next) => {
    // Promise.resolve also accepts a plain (non-promise) return value.
    const outcome = Promise.resolve(fn(req, res, next));
    return outcome.catch(next);
  };
};
// One browser is launched when the module loads; every request awaits this
// same promise and opens its own page in the shared browser.
const browserReady = puppeteer.launch({
  args: ["--no-sandbox", "--disable-setuid-sandbox"]
});

const app = express();

app.set("port", process.env.PORT || 5000);

// GET /: responds with the content of the page named by the `url` query
// parameter (or of the example page when none is given).
app.get("/", asyncHandler(async (req, res) => {
  const browser = await browserReady;
  const page = await browser.newPage();
  try {
    const target = req.query.url || "http://www.example.com";
    await page.goto(target);
    const html = await page.content();
    return res.send(html);
  }
  catch (err) {
    // Failures while loading or reading the page are reported as 400.
    return res.status(400).send(err.message);
  }
  finally {
    // Only the page is closed; the browser stays open for later requests.
    await page.close();
  }
}));

// Errors forwarded through next() end up here.
app.use((err, req, res, next) => res.sendStatus(500));

app.listen(app.get("port"), () => {
  console.log("listening on port", app.get("port"));
});
这仍然是一项繁重的工作,而且 Puppeteer 是以子进程的方式运行 Chromium 的,因此您可能需要考虑将此作业交给任务队列(例如 Bull)并在后台运行。
另见: