我用 node.js 和 puppeteer 编写了一个脚本,用来解析网页中所有帖子标题的链接(links),并使用这些链接导航到各自的内页,从那里抓取标题。
我本来可以直接从列表页(landing page)抓取标题,但我的目的是利用这些新获取的链接逐个导航,并从目标页面解析标题。执行脚本时,它抓取到第一个标题后就抛出错误。在保持我所采用的逻辑的前提下,怎样才能让它成功运行?
Link to one of such target pages
到目前为止,这是我的脚本:
// Opens a Stack Overflow tag listing, reads the link of every ".summary"
// entry, navigates to each link and logs the title found under
// "#question-header" on the target page.
const puppeteer = require("puppeteer");
(async function main() {
const browser = await puppeteer.launch({headless:false});
const page = await browser.newPage();
await page.goto("https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&pageSize=50");
// NOTE(review): the promise returned here is not awaited, so the next line
// runs without waiting for ".summary" to appear.
page.waitForSelector(".summary");
const sections = await page.$$(".summary");
for (const section of sections) {
const itemName = await section.$eval(".question-hyperlink", item => item.href);
// NOTE(review): this inner async function is invoked without await, so the
// outer loop continues to the next `section` while the same `page` is
// being navigated away from the listing.
(async function main() {
await page.goto(itemName);
// NOTE(review): not awaited, and it waits for the same ".summary" selector
// that was used on the listing page.
page.waitForSelector(".summary");
const titles = await page.$$("#question-header");
for (const title of titles) {
const itmName = await title.$eval("#question-header .question-hyperlink", itm => itm.innerText);
console.log(itmName);
}
})();
}
// NOTE(review): called without await and without waiting for the inner
// async functions started in the loop above.
browser.close();
})();
我在控制台中看到的内容:
(node:1992) UnhandledPromiseRejectionWarning: Error: Execution context was destroyed, most likely because of a navigation.
at rewriteError (c:\Users\WCS\node_modules\puppeteer\lib\ExecutionContext.js:144:15)
at <anonymous>
at process._tickCallback (internal/process/next_tick.js:189:7)
(node:1992) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)
(node:1992) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
How to search content related to keyword in an website?
(node:1992) UnhandledPromiseRejectionWarning: TimeoutError: waiting for selector ".summary" failed: timeout 30000ms exceeded
at new WaitTask (c:\Users\WCS\node_modules\puppeteer\lib\FrameManager.js:862:28)
at Frame._waitForSelectorOrXPath (c:\Users\WCS\node_modules\puppeteer\lib\FrameManager.js:753:12)
at Frame.waitForSelector (c:\Users\WCS\node_modules\puppeteer\lib\FrameManager.js:711:17)
at Page.waitForSelector (c:\Users\WCS\node_modules\puppeteer\lib\Page.js:1043:29)
at main (c:\Users\WCS\scrape.js:15:18)
at <anonymous>
at process._tickCallback (internal/process/next_tick.js:189:7)
(node:1992) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 2)
可以看到,在这些错误信息之间,我还是得到了一个结果。
答案 0 :(得分:2)
有两种方法可以解决您的问题:
第一种:先创建要遍历的 URL 数组,然后重用 page 逐个访问它们。
const puppeteer = require("puppeteer");

/**
 * Collects every question link from the Stack Overflow tag listing, then
 * reuses the same page to visit each link in turn and log its title.
 *
 * The URLs are extracted as plain strings before any further navigation, so
 * no element handle from the listing page is used after the page has moved on.
 */
(async function main() {
  const browser = await puppeteer.launch({headless:false});
  try {
    const page = await browser.newPage();
    await page.goto("https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&pageSize=50", {waitUntil: 'networkidle2'});
    await page.waitForSelector(".summary");
    const urls = await page.$$eval(".question-hyperlink", items => items.map(item => item.href));
    console.log(urls);
    for (const url of urls) {
      await page.goto(url);
      await page.waitForSelector("#question-header");
      const title = await page.$eval("#question-header a", item => item.textContent);
      console.log(title);
    }
  } finally {
    // Close the browser even when a navigation or selector wait throws;
    // without this the launched browser is left running after a failure.
    await browser.close();
  }
})().catch(err => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
第二:如Romain建议的那样,创建另一个页面并使用它来遍历页面。
下面是按方法 2 修改后的脚本,同时还修正了另外几个问题(缺少 await 运算符、问题页面上使用的选择器不正确):
const puppeteer = require("puppeteer");

/**
 * Reads each ".summary" entry from the Stack Overflow tag listing and opens
 * its link in a second page to log the question title.
 *
 * Navigation happens only in `newPage`, so the element handles obtained from
 * `page` are still usable on every iteration of the loop.
 */
(async function main() {
  const browser = await puppeteer.launch({headless:false});
  try {
    const page = await browser.newPage();
    const newPage = await browser.newPage();
    await page.goto("https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&pageSize=50", {waitUntil: 'networkidle2'});
    await page.waitForSelector(".summary");
    const sections = await page.$$(".summary");
    for (const section of sections) {
      const itemURL = await section.$eval(".question-hyperlink", item => item.href);
      await newPage.goto(itemURL);
      await newPage.waitForSelector("#question-header"); // <-- was ".summary"
      const titles = await newPage.$$("#question-header");
      for (const title of titles) {
        const itmName = await title.$eval("#question-header .question-hyperlink", itm => itm.innerText);
        console.log(itmName);
      }
    }
  } finally {
    // Close the browser even when a navigation or selector wait throws;
    // without this the launched browser is left running after a failure.
    await browser.close();
  }
})().catch(err => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
答案 1 :(得分:0)
我没有重现这个场景,但您的两个错误来自:
1. page.waitForSelector(".summary"); 前面缺少 await(共有两处);
2. 用 page.goto() 离开了当前上下文,之后又试图在 section 对象上求值,而这些对象已不再是 DOM 的一部分。
要解决第一个问题,只需补上这两个缺失的 await。
要解决第二个问题,请使用 let newPage = await browser.newPage() 和 newPage.goto('whereveryouwanttogo.com') 打开一个新页面。这样就不会破坏原始的 page,仍然可以继续对 section 对象进行操作。