我试图通过加载更多按钮来抓取一个网站,但我无法在噩梦中执行递归功能。我的代码是这样的:
const Nightmare = require('nightmare');
const nightmare = Nightmare({
show:true
});// }
const request = require('request');
const cheerio = require('cheerio');
let url = 'https://www.housers.com/es/proyectos/avanzado';
let propertyArray = [];
var getThePage = function() {
nightmare
.goto('https://www.housers.com/es/proyectos/avanzado')
.wait(1500)
.click('#loadMore')
.evaluate(() =>{
return document.querySelector('.all-info').innerHTML;
})
.end()
.then((result) => {
let $ = cheerio.load(result);
let loadMore = $('#loadMore')
if (loadMore) {
getThePage();
}
return result
})
.catch((error) => {
console.error('Search failed:', error);
});
}
getThePage()
我不知道你是否有办法通过这种方法或任何其他想法来做到这一点
答案 0 :(得分:0)
如果要废弃表格中的数据,则无需使用噩梦。在网络选项卡中,您会看到它调用此端点:
https://www.housers.com/es/proyectos/avanzado/scroll
有一些分页&页面大小,让我们每页200(不知道它是否高于限制)。
然后你只需要解析html&将数据放入数组:
const axios = require('axios');
const querystring = require('querystring');
const cheerio = require('cheerio');
const entities = require("entities");
const url = 'https://www.housers.com/es/proyectos/avanzado/scroll';
const prices = [];
function doRequest(url, page){
return axios.post(url + '?page=' + page + '&size=200', querystring.stringify({
word: "",
country: "",
type: "",
order: "STOCK_PRICE_VARIATION",
orderDirection: "DESC"
}));
}
async function getPrices() {
var empty = false;
var page = 0;
while (!empty) {
//call API
console.log("GET page n°" + page);
var res = await doRequest(url, page);
page++;
//parse HTML
const $ = cheerio.load(res.data,{
xmlMode: true,
normalizeWhitespace: true,
decodeEntities: true
});
if (res.data.trim() !== ""){
//extract prices : put it in array
$('tr').map(function(){
var obj = [];
$(this).children('td').map(function(){
obj.push(entities.decodeHTML($(this).text().trim()));
});
prices.push(obj);
});
}
else {
empty = true;
}
}
console.log(prices);
console.log("total length : " + prices.length);
}
getPrices();