最近,我一直在尝试使用puppeteer,最终目的是学习如何抓取Glassdoor上的公司评论,但我遇到了一个奇怪的问题。由于某种原因,尽管我的脚本的抓取逻辑在另一个站点上运行正常,但我的JQuery(从此处安装)选择器无法找到包含公司评论的<div class="hreview">。
我首先在黑客新闻上测试了我的脚本,并成功打印了每篇文章的编号和标题:
const puppeteer = require("puppeteer");
const fs = require('fs');
const $ = require('cheerio');
/**
 * Reads the combined text of the direct children of `current_selector` that
 * match `child_element_name`, logging the outcome to the console.
 *
 * @param {*} current_selector - Anything `$()` accepts (an element or a selection).
 * @param {string} child_element_name - Selector handed to `.children()`.
 * @returns {string|Array} The text of the matched children, or an empty
 *   array when no child matches.
 */
function jq_helper(current_selector, child_element_name) {
  const matches = $(current_selector).children(child_element_name);

  if (matches.length) {
    const text = matches.text();
    console.log(text);
    return text;
  }

  console.log(`${child_element_name} does not exist`);
  return [];
}
// Launches a visible browser, loads the Hacker News front page, extracts the
// number and title text of every ".athing" row with cheerio, and writes the
// collected records to scrape_hackernews.txt as JSON.
puppeteer.launch({ headless: false }).then(async browser => {
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1440, height: 794 });
    // page.goto() already resolves once the page has loaded, so a separate
    // waitForNavigation() promise is unnecessary (and would be left dangling
    // as an unhandled rejection if goto() failed).
    await page.goto('https://news.ycombinator.com/news');

    const data = [];
    const html = await page.content();

    // A regular function (not an arrow) is required: cheerio binds `this`
    // to the current element.
    $(".athing", html).each(function() {
      console.log("\nMain scraping function underway.");

      // Getting number of each link
      const numberSelector = ".title[valign='top'][align='right']";
      const number = jq_helper(this, numberSelector);
      data.push({'number': number});

      // Traversing to next level of descendants - getting title and link of each article.
      // Declared locally; the original leaked this (and `link`) as implicit globals.
      const titleCell = $(this).children(".title:not([valign])");

      // Getting title and link of article
      const linkSelector = ".storylink";
      const link = jq_helper(titleCell, linkSelector);
      data.push({'link': link});
    });

    fs.writeFile('scrape_hackernews.txt', JSON.stringify(data), (err) => {
      // throws an error, you could also catch it here
      if (err) throw err;
      // success case, the file was saved
      console.log('\nFile saved!');
    });
  } finally {
    // Always release the browser, even when navigation or scraping throws.
    await browser.close();
  }
}).catch((err) => {
  // Surface launch/scrape failures instead of leaving the rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
但是,在以下针对Glassdoor的脚本中,上述抓取逻辑失败了,每个被选择的html元素都被报告为不存在:
const puppeteer = require("puppeteer");
const $ = require('cheerio');
const fs = require('fs');
/**
 * Reads the combined text of the direct children of `current_selector` that
 * match `child_element_name`, logging the outcome to the console.
 *
 * @param {*} current_selector - Anything `$()` accepts (an element or a selection).
 * @param {string} child_element_name - Selector handed to `.children()`.
 * @returns {string|Array} The text of the matched children, or an empty
 *   array when no child matches.
 */
function jq_helper(current_selector, child_element_name) {
  const matches = $(current_selector).children(child_element_name);

  if (matches.length) {
    const text = matches.text();
    console.log(text);
    return text;
  }

  console.log(`${child_element_name} does not exist`);
  return [];
}
// Launches a visible browser, loads the Grubhub reviews page on Glassdoor,
// extracts pieces of every ".hreview" element with cheerio, and writes the
// collected records to scrape_example.txt as JSON.
//
// Selector fix: in CSS a space is the descendant combinator, so
// ".d-flex justify-content-between" means "a <justify-content-between>
// element inside .d-flex" and can never match. To require several classes on
// the SAME element they must be chained with dots, e.g.
// ".d-flex.justify-content-between".
//
// NOTE(review): .children() only matches direct children. If the live markup
// nests these elements more deeply than assumed here, .find() is needed
// instead - confirm against the actual page.
puppeteer.launch({ headless: false }).then(async browser => {
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1440, height: 794 });
    // page.goto() already resolves once the page has loaded, so a separate
    // waitForNavigation() promise is unnecessary (and would be left dangling
    // as an unhandled rejection if goto() failed).
    await page.goto('https://www.glassdoor.com/Reviews/Grubhub-Reviews-E419089.htm');

    const data = [];
    const html = await page.content();

    // A regular function (not an arrow) is required: cheerio binds `this`
    // to the current element.
    $(".hreview", html).each(function() {
      console.log("\nMain scraping function happening.");

      // Getting date of review
      const dateSelector = ".d-flex.justify-content-between";
      const date = jq_helper(this, dateSelector);
      data.push({'date': date});

      // Traversing next level
      const reviewColumn = $(this).children(".row.mt").children(".col-sm-11.pl-sm-lg.mx-0");

      // Getting title
      const titleSelector = ".h2.summary.strong.mt-0.mb-xsm";
      const title = jq_helper(reviewColumn, titleSelector);
      data.push({'title': title});

      // Getting stars
      const starsSelector = ".mr-xsm.d-lg-inline-block";
      const stars = jq_helper(reviewColumn, starsSelector);
      data.push({'stars': stars});

      // Getting employee info
      const employeeSelector = ".d-lg-inline-block";
      const employee_info = jq_helper(reviewColumn, employeeSelector);
      data.push({'employee_info': employee_info});

      // Traversing next level
      const reviewBody = reviewColumn.children("div");

      // Getting review sub-title
      // NOTE(review): stored under the same 'title' key as the main title,
      // exactly as in the original output format.
      const subtitleSelector = ".mainText.mb-0";
      const subtitle = jq_helper(reviewBody, subtitleSelector);
      data.push({'title': subtitle});

      // Traversing next level
      const description = reviewBody.children(".description");

      // Getting all pieces of review
      const reviewSelector = ".mt-md";
      const review = jq_helper(description, reviewSelector);
      data.push({'review': review});
    });

    fs.writeFile('scrape_example.txt', JSON.stringify(data), (err) => {
      // throws an error, you could also catch it here
      if (err) throw err;
      // success case, the file was saved
      console.log('\nFile saved!');
    });
  } finally {
    // Always release the browser, even when navigation or scraping throws.
    await browser.close();
  }
}).catch((err) => {
  // Surface launch/scrape failures instead of leaving the rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
有人能帮我起个头,让我的脚本至少能找到它要查找的第一个元素吗?之后我就可以自己继续了。我不知道这个错误出在哪里。
提前谢谢您。