实时抓取直播流的聊天内容 (puppeteer.js)

时间:2021-06-24 23:23:08

标签: javascript node.js web-scraping puppeteer

我想通过网络抓取实时从流中获取聊天内容。

在 puppeteer 的 .then() 函数内部创建 while 循环似乎行不通,在某些实现中甚至会让整个程序无法运行。

我能够完成初始抓取,但在所有情况下,程序都会直接结束,并不会执行我实现的 while 循环。

不含 while 循环的可运行代码:

const puppeteer = require ('puppeteer');

//initiating Puppeteer
// Launch a browser, scrape the chat comments currently rendered on the stream
// page once, print them, and close the browser.
puppeteer
  .launch ()
  .then (async browser => {
    // try/finally so the browser is closed even when navigation or scraping
    // throws; the rejection still reaches the .catch () below.
    try {
      //opening a new page and navigating to the live stream
      const page = await browser.newPage ();
      await page.goto ('https://www.younow.com/Ken_Nara24');
      await page.waitForSelector ('body');

      //manipulating the page's content
      const getComments = await page.evaluate (() => {
        const scrapeItems = [];

        document.body.querySelectorAll ('.comment').forEach (item => {
          const commentAuthor = item.querySelector ('div.user-card__header.mini-profile-launcher').innerText;
          // A comment may have no body element; report it as an empty string
          // explicitly instead of relying on a swallowed TypeError.
          const bodyNode = item.querySelector ('div.user-card__body.ng-star-inserted');
          const commentContent = bodyNode ? bodyNode.innerText : '';

          scrapeItems.push ({
            commentAuthor: commentAuthor,
            commentContent: commentContent,
          });
        });

        return {
          "userComments": scrapeItems,
        };
      });

      //outputting the scraped data
      console.log (getComments);
    } finally {
      //closing the browser
      await browser.close ();
    }
  })
  //handling any errors
  .catch (function (err) {
    console.error (err);
  });

所有试图实现逻辑循环的尝试都是徒劳的。我找不到一种方法或过去的问题/例子来清楚地定义如何或是否可以完成这样的事情。我自己做了几次尝试来实现它,但都没有正确编译。

我在这里遗漏了什么重要的东西吗?我只想监听一个网页,每 3-5 秒重新抓取一次。

1 个答案:

答案 0 :(得分:2)

如果您仍然需要帮助,可以尝试这种方式。

const puppeteer = require("puppeteer");
/* Guard flag: true while a scrape is running; scraper() returns early when it is set. */
let pageScraping = false;

/**
 * Scrapes the chat comments currently shown on the stream page, logs them,
 * and schedules itself to run again 5 seconds after it finishes.
 *
 * Each run launches its own headless browser and closes it in `finally`.
 * Errors are logged (message only) and never propagate, so polling continues
 * after a failed run. The module-level `pageScraping` flag makes a call that
 * arrives while another run is in progress return immediately.
 *
 * @returns {Promise<void>}
 */
const scraper = async () => {
  if (pageScraping === true) return; /* a previous run is still in progress */
  let browser;
  const pageUrl = 'https://www.younow.com/Ken_Nara24';

  try {
    pageScraping = true; /* mark this run as in progress */
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto(pageUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

    /* wait for chat to be visible */
    await page.waitForSelector('.chat', { visible: true, timeout: 60000 });

    const getComments = await page.evaluate(() => {
      const scrapeComments = [];

      document.querySelectorAll('.comment').forEach(comment => {
        const commentAuthor = comment.querySelector('div[class="user-card__header mini-profile-launcher"]').innerText;
        /* a comment without a body element yields an empty string instead of aborting the whole scrape */
        const bodyNode = comment.querySelector('div[class="user-card__body ng-star-inserted"]');
        const commentContent = bodyNode ? bodyNode.innerText : '';

        scrapeComments.push({
          'commentAuthor': commentAuthor,
          'commentContent': commentContent,
        });
      });

      return { 'userComments': scrapeComments };
    });

    console.log(getComments); /* log comments */
  } catch (err) {
    console.log(err.message);
  } finally {
    if (browser) { /* only close a browser that was actually launched */
      try {
        await browser.close();
        console.log('closing browser');
      } catch (closeErr) {
        /* a failed close must not prevent the flag reset and the next run below */
        console.log(closeErr.message);
      }
    }
    pageScraping = false; /* allow the next run */
    /* setTimeout returns a timer handle, not a promise, so there is nothing to await */
    setTimeout(scraper, 5000); /* re-scrape in 5 seconds */
  }
};

/* Start polling: first run after 5 seconds; scraper() re-arms the timer itself when it finishes. */
setTimeout(scraper, 5000);
相关问题