如何抓取网址

时间:2019-08-09 00:35:35

标签: node.js puppeteer

我要抓取多个网址

const puppeteer = require('puppeteer');

// Intended purpose: launch a browser through puppeteer, visit each profile URL
// in `elements`, read two pieces of text from every page, and resolve with the
// collected array.
//
// NOTE(review): as written this snippet does not parse. The `await`
// expressions inside the `for` loop sit in the arrow function passed to
// `page.evaluate`, and that arrow function is not declared `async`.
let scrape = async () => {
    // Non-headless launch; the browser profile is stored in ./user_data.
    const browser = await puppeteer.launch({headless: false,userDataDir: "./user_data"});

   // Profile pages to visit.
   let elements = ['https://tr.pinterest.com/gamzeeerkek','https://tr.pinterest.com/jislaynekauany_']
   // NOTE(review): no `page` exists in this scope yet — the only `page`
   // declaration is the `let page` inside the callback below.
   // NOTE(review): the callback given to page.evaluate presumably runs inside
   // the browser page, where `browser` and `elements` from this scope would not
   // be reachable — confirm against the Puppeteer documentation.
   const result = await page.evaluate(() => {     

    for(let url of elements)
    {
        // These three awaits are what trigger the SyntaxError: the enclosing
        // arrow function is not async.
        let page = await browser.newPage();
        await page.goto(url);
        await page.waitFor(1000);
            // Text of the first element matching `.lH1` and of the second
            // element carrying class `tBJ`.
            let title = document.querySelector('.lH1').innerText;
            let title1 = document.getElementsByClassName('tBJ')[1].innerText; 

            // NOTE(review): `data` is never declared anywhere in this snippet.
            data.push({title, title1});

    }
    return data; // Return our data array
    });

    // The promise returned by close() is not awaited here.
    browser.close();
    return result; // Return the data

};

// Run the scraper and print whatever it resolves with. There is no rejection
// handler on this chain.
scrape().then((value) => {
    console.log(value); // Success!
});

我的错误是:

  

        let page = await browser.newPage();
                   ^^^^^

     

SyntaxError: await is only valid in async function

1 个答案:

答案 0 :(得分:0)

您看到此错误,是因为您在 async 函数之外使用了 await。另外,由于代码中还存在作用域(上下文)和语法方面的问题,即使只是加上 async 关键字,脚本仍然会报出其他错误,无法正常工作。

这是脚本:

/**
 * Visits each profile URL in turn and collects two pieces of text from the
 * rendered page.
 *
 * Expects a `puppeteer` binding to be in scope
 * (e.g. `const puppeteer = require('puppeteer');`).
 *
 * @returns {Promise<Array<{title: string, title1: string}>>} One entry per
 *   URL, in the same order as the URL list.
 * @throws Rejects when navigation, the selector wait, or the in-page lookup
 *   fails. The browser is closed before the rejection propagates.
 */
const scrape = async () => {
  const browser = await puppeteer.launch({ headless: false, userDataDir: './user_data' });

  // try/finally guarantees the browser process is shut down even when one of
  // the pages fails to load or the expected elements are missing.
  try {
    const urls = [
      'https://tr.pinterest.com/gamzeeerkek',
      'https://tr.pinterest.com/jislaynekauany_',
    ];
    const data = [];

    // A single tab is reused for every URL instead of opening one per URL.
    const page = await browser.newPage();

    for (const url of urls) {
      await page.goto(url);
      // Wait for the title element itself rather than sleeping a fixed time.
      await page.waitForSelector('.lH1');

      // The callback only touches DOM globals and returns a plain object;
      // nothing from the surrounding Node scope is referenced inside it.
      const result = await page.evaluate(() => {
        const title = document.querySelector('.lH1').innerText;
        const title1 = document.getElementsByClassName('tBJ')[1].innerText;

        return { title, title1 };
      });

      data.push(result);
    }

    return data; // Return the data
  } finally {
    await browser.close();
  }
};

// Run the scraper and print what it collected. The rejection handler keeps a
// failed run from ending as an unhandled promise rejection: the error is
// logged and the process is marked as failed through its exit code.
scrape()
  .then((value) => {
    console.log(value); // Success!
  })
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

输出:

  

[{title:'Gamze Erkek',title1:'Follow'},     {title:'Jislayne',title1:'Follow'}]