我正在从网页上抓取数据,分页功能也已经正常工作。页面上有一个表格,表格的每个 tr 都包含位置、标题和 URL。
在 .map((index, element) => { ... }) 回调里,我想为每个 tr 调用 "async url => { await page.goto(`${url}`); }",
跳转到详情页,从详情页抓取数据,并把结果存回列表后返回。
我该怎么做?(也就是如何在 .map() 中调用异步函数)
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
/**
 * Walks every page of the podcast episode table and collects one record per
 * table row.
 *
 * cheerio's `.map()` is synchronous, so detail pages cannot be awaited inside
 * its callback. Rows are therefore extracted first, and detail pages are then
 * visited one by one in a plain `for...of` loop.
 *
 * @param {object} page - Puppeteer page used to browse the listing.
 * @param {?function(object, object): (object|Promise<object>)} [scrapeDetails=null]
 *   Optional callback invoked as `scrapeDetails(detailPage, listing)` once the
 *   listing's URL has been opened in a separate tab. Whatever object it
 *   returns is merged into the listing. When omitted, no detail page is
 *   visited.
 * @returns {Promise<Array<{position: string, title: string, datetime: string, url: (string|undefined)}>>}
 *   Listings gathered from all pages that could be read.
 */
async function scrapeListings(page, scrapeDetails = null) {
  await page.goto(
    "https://www.example.com/podcast-detail/nth28-2ef41/99%25-Invisible-Podcast",
    { waitUntil: ["load", "domcontentloaded", "networkidle0"] }
  );

  const listings = [];
  // Detail pages get their own tab so the listing tab keeps its pagination state.
  const detailPage = scrapeDetails ? await page.browser().newPage() : null;

  try {
    let moreResults = true;
    while (moreResults) {
      const html = await page.content();
      const $ = cheerio.load(html);
      const rows = $("#yw0 > table > tbody > tr")
        .map((index, element) => {
          const row = $(element);
          return {
            position: row.find("td.id.tc").text(),
            title: row.find("a.title.listen-now").text(),
            datetime: row.find("span.datetime").text(),
            url: row.find("a").attr("href"),
          };
        })
        .get();

      if (detailPage) {
        for (const listing of rows) {
          if (!listing.url) {
            continue;
          }
          // hrefs may be relative; resolve them against the listing page.
          await detailPage.goto(new URL(listing.url, page.url()).href);
          Object.assign(listing, await scrapeDetails(detailPage, listing));
        }
      }

      listings.push(...rows);
      console.log(rows);

      const [nextLink] = await page.$x(`//a[contains(text(),'→')]`);
      if (!nextLink) {
        // No "next" arrow: this was the last page.
        moreResults = false;
      } else {
        await nextLink.click();
        // Give the next page time to render before its HTML is read.
        await sleep(1000);
      }
    }
  } catch (error) {
    // Best effort: report the failure and return what was collected so far.
    console.error("Stopped scraping listings:", error);
  } finally {
    if (detailPage) {
      await detailPage.close();
    }
  }

  return listings;
}
/**
 * Pauses the calling async function.
 *
 * @param {number} miliseconds - Delay passed to `setTimeout`.
 * @returns {Promise<undefined>} Settles once the timer has fired.
 */
async function sleep(miliseconds) {
  await new Promise(done => {
    setTimeout(done, miliseconds);
  });
}
/**
 * Entry point: launches a visible browser, scrapes the listings and prints
 * them.
 *
 * The browser is closed in `finally` so the Chromium process is not left
 * running (and the Node process can exit) even when scraping throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // TODO: re-enable once the MongoDB connection helper exists.
  //await connectToMongoDb();
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1057 });
    const listings = await scrapeListings(page);
    console.log(listings);
  } finally {
    await browser.close();
  }
}
// Never leave the promise floating: report a failed run and signal it through
// the process exit code.
main().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
答案 0 :(得分:0)
我尝试把您的脚本改写为只使用 Puppeteer,不再需要 cheerio。
spring.transaction.default-timeout
答案 1 :(得分:0)
如果有人需要通过单击链接来打开新页面(而不是直接跳转到该 URL),可以使用下面的代码。如果有人有更好的解决方案,欢迎分享,谢谢。
const getPodcast = async url => {
const pageTarget = page.target();
await page.click(".table > .items > tr > td > .title");
const newTarget = await browser.waitForTarget(
target => target.opener() === pageTarget
);
const pagePod = await newTarget.page();
console.log("current page count ", (await browser.pages()).length);
//const pagePod = await browser.newPage();
//const openPod = await pagePod.goto(url, { waitUntil: `networkidle0`,
timeout: 0 });