我正在尝试使用 Apify SDK 抓取页面内容。下面的代码可以正常工作。但是,我该如何像 puppeteer.launch({ headless: true }) 那样,强制 Apify SDK 使用无头模式?
代码供您参考:
/**
 * Crawls pages starting at a fixed start URL with an Apify PuppeteerCrawler
 * and collects, for every visited page, its URL, title and HTML length.
 *
 * Side effect: sets `process.env.APIFY_LOCAL_STORAGE_DIR` to a run-specific
 * directory derived from `number` before the request queue is opened.
 *
 * @param {number|string} number - Run identifier; used both in the storage
 *   directory name and as the request queue name.
 * @returns {Promise<{links: string[], title: string[], content: number[]}>}
 *   Parallel arrays: visited URLs, page titles and lengths of the page HTML.
 */
async function scrape(number) {
  const output = { links: [], title: [], content: [] };
  const URL = "https://somepage/";

  process.env.APIFY_LOCAL_STORAGE_DIR = `/someappfolder/apify_storage/run_${number}`;

  const requestQueue = await Apify.openRequestQueue(number);
  await requestQueue.addRequest({ url: URL });

  // Only links located under the start URL are followed.
  const pseudoUrls = [new Apify.PseudoUrl(URL + "[.*]")];

  const crawler = new Apify.PuppeteerCrawler({
    requestQueue,
    handlePageFunction: async ({ request, page }) => {
      // Query the page once per value; the original fetched the title and
      // the full HTML twice for every page.
      const record = {
        url: request.url,
        title: await page.title(),
        content: (await page.content()).length,
      };
      // `record` is the per-page payload; forward it from here if needed
      // (e.g. to an external store).

      output.links.push(record.url);
      output.title.push(record.title);
      output.content.push(record.content);

      console.log(`URL: ${request.url}`);

      await Apify.utils.enqueueLinks({
        page,
        selector: 'a',
        pseudoUrls,
        requestQueue,
      });
    },
    maxRequestsPerCrawl: 10,
    maxConcurrency: 10,
    minConcurrency: 2,
  });

  await crawler.run();
  return output;
}
答案 0 :(得分:2)
在与 requestQueue 相同的层级添加 launchPuppeteerOptions: { headless: true }
https://sdk.apify.com/docs/typedefs/launch-puppeteer-options#docsNav
答案 1 :(得分:0)
// Environment variables are strings; assign '1' explicitly instead of relying
// on the implicit number-to-string conversion performed by process.env.
process.env.APIFY_HEADLESS = '1';
经过数小时的搜索,我偶然发现了答案…… https://sdk.apify.com/docs/guides/environment-variables#apify_headless
答案 2 :(得分:0)
您可以像下面这样,把 headless(无头)选项添加到 launchPuppeteerOptions 中:
const crawler = new Apify.PuppeteerCrawler({
requestQueue,
launchPuppeteerOptions: {
headless: true,
ignoreHTTPSErrors: true,
// slowMo: 500,
},
maxRequestsPerCrawl: settings.maxurls,
maxConcurrency: settings.maxcrawlers,