用于抓取的无头chrome nodejs异步循环

时间:2017-11-21 02:36:39

标签: javascript node.js google-chrome asynchronous google-chrome-headless

我正在尝试使用nodejs以及我们现在拥有无头镀铬浏览器并与之交互的方式,这非常棒!

我有一些代码,而且我正在努力抓取1个网站而没有任何问题。然而,当我想刮掉多个我的循环似乎搞砸了,我很确定这与async / await有关。

我的循环接近此代码的底部 - 是否有人有任何建议?

谢谢堆!

const HeadlessChrome = require('simple-headless-chrome')

const browser = new HeadlessChrome({
  headless: true, // If you turn this off, you can actually see the browser navigate with your instructions,
})

async function navigateWebsite(urlToGoTo) {
  try {
    await browser.init()

    const mainTab = await browser.newTab({
      privateTab: false
    })

    await mainTab.inject('jquery')

    let cookieName = 'li_at'
    let cookieValue = 'cyzzzzzzzzz'
    let cookieDomain = '.www.linkedin.com'

    await mainTab.setCookie(cookieName, cookieValue, {
      domain: cookieDomain
    })

    // Navigate to a URL
    await mainTab.goTo(urlToGoTo)
    await mainTab.wait(2000);

    // Get a HTML tag value based on class id
    let businessName = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.org-top-card-module__name');

    let industry = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.company-industries');

    let followers = await mainTab.evaluate(function (selector) {
      const selectorHtml = document.querySelector(selector)
      return selectorHtml.innerHTML
    }, '.org-top-card-module__followers-count');

    let details = {
      businessName: cleanData(businessName),
      industry: cleanData(industry),
      followers: cleanData(followers)
    }

    console.log(details)


    // Resize the viewport to full screen size (One use is to take full size screen shots)
    await mainTab.resizeFullScreen()

    // Take a screenshot
    await mainTab.saveScreenshot()

    // Close the browser
    await browser.close()

  } catch (err) {
    console.log('ERROR!', err)
  }
}

let websites = []

websites.push('https://www.linkedin.com/company/qrious-limited/')
websites.push('https://www.linkedin.com/company/wentworth-consulting-nz-/')
websites.push('https://www.linkedin.com/company/capita/')

websites.forEach(function (i) {
   navigateWebsite(i)
})


function cleanData(a) {
  return a.result.value.replace(/(\r\n|\n|\r)/gm, "").trim()
}

1 个答案:

答案 0 :(得分:1)

Promise.all()是异步的,但不等待它。您可以使用await将您的网站列表映射到导航功能,或确保Promise.all(websites.map(w => navigateWebsite(w))); // or for (let w of websites) { await navigateWebsite(w); } 每个结果。

{{1}}
相关问题