我正在尝试 Puppeteer。我已经可以读取标签之间的文本内容。现在,我想了解是否也可以抓取元素上的内联属性(data-* 属性),例如下面这个 div:
<div class="distance-info-text distance font-xs-light" data-lat="50.912758301334" data-lng="6.019417197437" data-nid="153">83.8 km</div>
例如,现在我想获取这些内联属性:
data-lat = "50.912758301334" data-lng = "6.019417197437"
用 Puppeteer 可以抓取这些属性吗?
我的代码如下:
const puppeteer = require('puppeteer');
const fs = require('fs');
/**
 * Opens `url` in a Puppeteer-launched browser, collects the text of every
 * element with the `distance-info-text` class together with the text of its
 * next sibling element, and writes the result to ./netflixscrape.json.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>} Resolves once the browser has been closed and the
 *   write attempt has finished. A failed write is logged, not thrown.
 * @throws Propagates any error raised while launching, navigating or
 *   evaluating; the browser is still closed in that case.
 */
const getParagraphs = async (url) => {
  const browser = await puppeteer.launch();
  let movies;
  try {
    const page = await browser.newPage();
    await page.goto(url);
    // This callback is executed inside the page, so it must be self-contained
    // and may only use browser globals such as `document`.
    movies = await page.evaluate(() => {
      // The leading dot makes this a class selector; without it the selector
      // looks for a <distance-info-text> tag and matches nothing.
      const nodes = document.querySelectorAll('.distance-info-text');
      return Array.from(nodes, (node) => {
        // nextElementSibling is null when the node is the last element child.
        const sibling = node.nextElementSibling;
        return {
          title: node.innerText.trim(),
          summary: sibling ? sibling.innerText.trim() : '',
        };
      });
    });
  } finally {
    // Always release the browser process, even when scraping failed.
    await browser.close();
  }

  try {
    // Awaited so that callers know the file exists once this function resolves.
    await fs.promises.writeFile("./netflixscrape.json", JSON.stringify(movies, null, 3));
    console.log("Great Success");
  } catch (err) {
    // Writing the file stays best-effort: report the problem and carry on.
    console.error(err);
  }
};
// CommonJS export: requiring this file yields the getParagraphs function itself.
module.exports = getParagraphs;
答案 0 :(得分:0)
是的,您可以使用dataset属性轻松做到这一点:
// Runs inside the page: collect the text and the data-* attributes of every
// element carrying the `distance-info-text` class.
const movies = await page.evaluate(() => {
  // The leading dot makes this a class selector; without it the selector
  // looks for a <distance-info-text> tag and matches nothing.
  const distanceNodes = document.querySelectorAll('.distance-info-text');
  const movieArr = [];
  for (const node of distanceNodes) {
    // nextElementSibling is null when the node is the last element child.
    const sibling = node.nextElementSibling;
    movieArr.push({
      title: node.innerText.trim(),
      summary: sibling ? sibling.innerText.trim() : '',
      lat: node.dataset.lat, // <-- using dataset property (reads data-lat)
      lng: node.dataset.lng, // <-- using dataset property (reads data-lng)
    });
  }
  return movieArr;
});