Puppeteer:jQuery 选择器在某些站点上找不到元素?

时间:2019-07-02 00:29:30

标签: javascript jquery html web-scraping jquery-selectors

最近,我一直在尝试使用 Puppeteer,最终目的是学习如何抓取 Glassdoor 上的公司评论,但我遇到了一个奇怪的问题。由于某种原因,尽管我的脚本的抓取逻辑在另一个站点上运行正常,但我的 jQuery(从此处安装)选择器却无法找到包含公司评论的 <div class="hreview"> 元素。

我首先在 Hacker News 上测试了我的脚本,并成功打印了每篇文章的编号和标题:

const puppeteer = require("puppeteer");
const fs = require('fs');

const $ = require('cheerio');

/**
 * Helper around cheerio's jQuery-style API: logs and returns the text of the
 * children of `current_selector` that match `child_element_name`.
 *
 * @param {*} current_selector - Value handed to `$()` (the callers in this
 *   script pass an element or a cheerio selection).
 * @param {string} child_element_name - Selector passed to `.children()`.
 * @returns {string|Array} The text of the matching children, or an empty
 *   array (after logging a "does not exist" message) when nothing matches.
 */
function jq_helper(current_selector, child_element_name) {
  // Run the query once and reuse the selection for both the existence check
  // and the text extraction.
  const matches = $(current_selector).children(child_element_name);

  if (!matches.length) {
    console.log(`${child_element_name} does not exist`);
    return [];
  }

  const scraped_data = matches.text();
  console.log(scraped_data);
  return scraped_data;
}


// Launch a visible browser, scrape the Hacker News front page and save the
// collected data to scrape_hackernews.txt.
puppeteer
  .launch({ headless: false })
  .then(async (browser) => {
    // try/finally guarantees the browser is closed even if scraping throws.
    try {
      const page = await browser.newPage();

      await page.setViewport({ width: 1440, height: 794 });
      // page.goto() resolves once the navigation has finished, so a separate
      // waitForNavigation() promise is not needed.
      await page.goto('https://news.ycombinator.com/news');

      const data = [];
      const html = await page.content();

      // A regular function (not an arrow) is used because the callback reads
      // the current element through `this`.
      $(".athing", html).each(function () {
        console.log("\nMain scraping function underway.");

        // Getting number of each link
        const number = jq_helper(this, ".title[valign='top'][align='right']");
        data.push({ 'number': number });

        // Traversing to next level of descendants - the cell holding the
        // title and link of each article
        const current_level = $(this).children(".title:not([valign])");

        // Getting title and link of article
        const link = jq_helper(current_level, ".storylink");
        data.push({ 'link': link });
      });

      // Synchronous write: a failure throws here and reaches the .catch()
      // below instead of being thrown from inside an async callback.
      fs.writeFileSync('scrape_hackernews.txt', JSON.stringify(data));
      console.log('\nFile saved!');
    } finally {
      await browser.close();
    }
  })
  .catch((err) => {
    // Report launch/scrape/write failures instead of leaving the rejection
    // unhandled.
    console.error(err);
    process.exitCode = 1;
  });

但是,在下面针对 Glassdoor 的脚本中,同样的抓取逻辑却失败了,并且每个选定的 HTML 元素都被报告为不存在:

const puppeteer = require("puppeteer");
const $ = require('cheerio');

const fs = require('fs');

/**
 * Helper around cheerio's jQuery-style API: logs and returns the text of the
 * children of `current_selector` that match `child_element_name`.
 *
 * @param {*} current_selector - Value handed to `$()` (the callers in this
 *   script pass an element or a cheerio selection).
 * @param {string} child_element_name - Selector passed to `.children()`.
 * @returns {string|Array} The text of the matching children, or an empty
 *   array (after logging a "does not exist" message) when nothing matches.
 */
function jq_helper(current_selector, child_element_name) {
  // Query once; the same selection serves the existence check and the text.
  const matches = $(current_selector).children(child_element_name);

  if (!matches.length) {
    console.log(`${child_element_name} does not exist`);
    return [];
  }

  const scraped_data = matches.text();
  console.log(scraped_data);
  return scraped_data;
}


// Launch a visible browser, scrape the Grubhub reviews page on Glassdoor and
// save the collected data to scrape_example.txt.
//
// Selector note: an element carrying several classes is matched by chaining
// the classes without spaces (".a.b"). Writing ".a b" is a descendant
// selector that looks for a <b> tag inside ".a", which is why the previous
// space-separated selectors never matched anything.
// NOTE(review): the class lists below are assumed to be copied from the
// page's class attributes; .children() only inspects direct children, so if a
// target is nested deeper it needs .find() instead - confirm against the
// live markup.
puppeteer
  .launch({ headless: false })
  .then(async (browser) => {
    // try/finally guarantees the browser is closed even if scraping throws.
    try {
      const page = await browser.newPage();

      await page.setViewport({ width: 1440, height: 794 });
      // page.goto() resolves once the navigation has finished, so a separate
      // waitForNavigation() promise is not needed.
      await page.goto('https://www.glassdoor.com/Reviews/Grubhub-Reviews-E419089.htm');

      const data = [];
      const html = await page.content();

      // A regular function (not an arrow) is used because the callback reads
      // the current element through `this`.
      $(".hreview", html).each(function () {
        console.log("\nMain scraping function happening.");

        // Getting date of review
        const date = jq_helper(this, ".d-flex.justify-content-between");
        data.push({ 'date': date });

        // Traversing next level
        let current_level = $(this)
          .children(".row.mt")
          .children(".col-sm-11.pl-sm-lg.mx-0");

        // Getting title
        const title = jq_helper(current_level, ".h2.summary.strong.mt-0.mb-xsm");
        data.push({ 'title': title });

        // Getting stars
        const stars = jq_helper(current_level, ".mr-xsm.d-lg-inline-block");
        data.push({ 'stars': stars });

        // Getting employee info
        const employee_info = jq_helper(current_level, ".d-lg-inline-block");
        data.push({ 'employee_info': employee_info });

        // Traversing next level
        current_level = $(current_level).children("div");

        // Getting review sub-title (stored under the same 'title' key as
        // before so the output format is unchanged)
        const subtitle = jq_helper(current_level, ".mainText.mb-0");
        data.push({ 'title': subtitle });

        // Traversing next level
        current_level = $(current_level).children(".description");

        // Getting all pieces of review
        const review = jq_helper(current_level, ".mt-md");
        data.push({ 'review': review });
      });

      // Synchronous write: a failure throws here and reaches the .catch()
      // below instead of being thrown from inside an async callback.
      fs.writeFileSync('scrape_example.txt', JSON.stringify(data));
      console.log('\nFile saved!');
    } finally {
      await browser.close();
    }
  })
  .catch((err) => {
    // Report launch/scrape/write failures instead of leaving the rejection
    // unhandled.
    console.error(err);
    process.exitCode = 1;
  });

有人能帮我起个头,让我的脚本找到它要查找的第一个元素吗?这样我就可以从那里继续下去。我不知道这个错误是从哪里来的。

提前谢谢您。

0 个答案:

没有答案