我正在构建一个小型抓取器(scraper),它会抓取搜索结果页面中的链接,然后依次点击每个链接,从打开的结果页面中抓取详细信息。到目前为止我有两个抓取器:一个抓取结果列表页面,另一个抓取单个结果页面。下面是结果列表页面抓取器的精简版:
const puppeteer = require('puppeteer');
// Module-level list that scrapeResults() appends its scraped result to.
var URLList = new Array;
/**
 * Opens the search result page in a Puppeteer-launched browser and pushes the
 * scraped result onto the global URLList.
 *
 * The scraping step itself is elided in this truncated snippet, so `results`
 * is never defined here; it stands for whatever the omitted code produces.
 */
let scrapeResults = async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.******.com/search_result');
// NOTE(review): presumably a fixed 1000 ms pause to let the page settle — confirm against the Puppeteer version in use.
await page.waitFor(1000);
// Selector constants intended for the elided scraping code; nothing visible below reads them.
const RESULT_SELECTOR ='#innerLeft ';
const RESULT_CLASS = 'dspListings2';
// scrape result page for URLs and put them in global URLList for further processing
// NOTE(review): push() appends `results` as ONE element; if `results` is itself an array, URLList becomes nested — confirm.
URLList.push(results);
// The call is not awaited, so the function can finish before the browser has shut down.
browser.close();
};
// Fire-and-forget: the returned promise is neither awaited nor given a rejection handler.
scrapeResults();
这是单个结果页面的抓取器(点击链接之后):
// Module-level holder for the scraped details; every scrapeResultDetails() run overwrites it.
var details=''; //to be populated by scrapeListings function
const puppeteer = require('puppeteer');
// url1..url3 are placeholders for the URLs collected by the first scraper.
URLList = [url1, url2, url3] // URLList is populated by the scrapeResults() function
// forEach does not wait for async callbacks: each iteration starts its scrape
// and moves on immediately, so every URL gets its own browser launch without
// waiting for the previous one to finish.
URLList.forEach(async (url) => {
/**
 * Scrapes the detail page at `url` (captured from the enclosing forEach
 * callback) and stores the result in the global `details`.
 * The scraping step is elided, so `resultDetails` is never defined in this snippet.
 */
const scrapeResultDetails = async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
// NOTE(review): presumably a fixed 1000 ms pause — confirm against the Puppeteer version in use.
await page.waitFor(1000);
// Selector intended for the elided scraping code; nothing visible below reads it.
const RESULT_DETAILS_SELECTOR = '#details_layout > p';
// scrape for result details
// assign result details to global details variable for further processing
details = resultDetails;
// Not awaited: the function can finish before the browser has shut down.
browser.close();
};
// Not awaited: the forEach callback returns while the scrape is still pending.
scrapeResultDetails();
});
结果页面返回一个 URL 列表,我再把它传给第二个抓取器,由 forEach 循环依次打开列表中的每个 URL 以获取详细信息。
问题
问题是,当第二个抓取器放在第一个抓取器内部时,我无法调用它。两者都使用了 async/await,这会导致错误。例如,下面是我尝试过但不起作用的写法:
const puppeteer = require('puppeteer');
// Module-level list that scrapeResults() appends its scraped result to.
var URLList = new Array;
// Module-level holder for the scraped details; every detail scrape overwrites it.
var details=''; //to be populated by scrapeListings function
/**
 * Combined attempt: scrapes the search result page, then starts a detail
 * scrape for every entry of URLList from inside the same function.
 *
 * The scraping steps are elided, so `results` and `resultDetails` are never
 * defined in this snippet.
 */
let scrapeResults = async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.******.com/search_result');
// NOTE(review): presumably a fixed 1000 ms pause — confirm against the Puppeteer version in use.
await page.waitFor(1000);
// Selector constants intended for the elided scraping code; nothing visible below reads them.
const RESULT_SELECTOR ='#innerLeft ';
const RESULT_CLASS = 'dspListings2';
// scrape result page for URLs and put them in global URLList for further processing
// NOTE(review): push() appends `results` as ONE element; if `results` is itself an array, URLList becomes nested — confirm.
URLList.push(results);
// Not awaited: execution continues before the browser has shut down.
browser.close();
// forEach does not wait for async callbacks, so scrapeResults() resolves
// while the detail scrapes started below are still pending.
URLList.forEach(async (url) => {
// Inner scraper; `browser` and `page` below shadow the outer ones of the same name.
const scrapeResultDetails = async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(url);
await page.waitFor(1000);
// Selector intended for the elided scraping code; nothing visible below reads it.
const RESULT_DETAILS_SELECTOR = '#details_layout > p';
// scrape for result details
// assign result details to global details variable for further processing
details = resultDetails;
// Not awaited, same as the outer browser.close().
browser.close();
};
// Not awaited: the forEach callback returns while the scrape is still pending.
scrapeResultDetails();
});
};
// Fire-and-forget: the returned promise is neither awaited nor given a rejection handler.
scrapeResults();
有什么想法吗?另外,循环中用到的全局变量应该在哪里声明?
答案 0 :(得分:0)
您需要改用 [for-of][1] 循环而不是 .forEach,因为 for-of 可以与 async/await 正确配合:循环体内的 await 会让每次迭代等待异步调用完成,而 .forEach 不会等待异步回调。
另外,您还漏掉了几处 await。
请查看我在代码中的注释:
const puppeteer = require('puppeteer');
// Module-level list of scraped result URLs, filled by scrapeResults().
var URLList = [];
// Module-level holder for the most recently scraped details.
// TODO: Global variables are bad; prefer the value returned by scrapeResultDetails().
var details = ''; //to be populated by scrapeListings function

/**
 * Scrapes a single result page and stores its details in the global `details`.
 *
 * The scraping step is elided in this snippet: `resultDetails` stands for
 * whatever that omitted code extracts using RESULT_DETAILS_SELECTOR.
 *
 * @param {string} url - Address of the result page to open.
 * @returns {Promise<*>} The scraped details (also assigned to `details`).
 */
const scrapeResultDetails = async (url) => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);
    await page.waitFor(1000);
    const RESULT_DETAILS_SELECTOR = '#details_layout > p';
    // (scrape for result details here — elided; must define `resultDetails`)
    details = resultDetails;
    return details;
  } finally {
    // `await` was missing in the question; the finally block also guarantees
    // the browser is closed when navigation or scraping throws.
    await browser.close();
  }
};

/**
 * Scrapes the search result page, then visits every collected URL in turn.
 *
 * The scraping step is elided in this snippet: `results` stands for whatever
 * that omitted code extracts using RESULT_SELECTOR / RESULT_CLASS.
 *
 * @returns {Promise<void>} Resolves once every URL in URLList has been scraped.
 */
let scrapeResults = async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('www.******.com/search_result');
    await page.waitFor(1000);
    const RESULT_SELECTOR ='#innerLeft ';
    const RESULT_CLASS = 'dspListings2';
    // (scrape result page for URLs here — elided; must define `results`)
    // NOTE(review): push() appends `results` as ONE element; if the scraping
    // code yields an array of URLs, use URLList.push(...results) — confirm.
    URLList.push(results);
  } finally {
    // `await` was missing in the question.
    await browser.close();
  }
  // Use a for-of loop here: unlike .forEach, the `await` in its body makes
  // each iteration wait for the previous scrape to finish.
  for (const url of URLList) {
    // `details` is repopulated after each iteration; the same value is also
    // returned, e.g. `const details = await scrapeResultDetails(url);`.
    await scrapeResultDetails(url);
  }
};