我一直在开发一个网页抓取(scraper)功能。
关于这个应用:有两个抓取器,分别从两个页面抓取公寓数据。每个抓取器抓取完成后,会把数据传给 mergeData 函数;
该函数的目标是把各抓取器传来的对象数组合并成一个包含所有对象的单一数组,然后再把这个数组传给插入函数,写入数据库。
下面是其中一个抓取器:
const data_functions = require('../data-functions/data-functions');
const axios = require('axios'); //npm package - promise based http client
const cheerio = require('cheerio'); //npm package - used for web-scraping in server-side implementations
//santScaper function which as paramater needs count which is sent in the scraping-service file.
exports.santScraper = async (count) => {
const url = `https://www.sant.ba/pretraga/prodaja-1/tip-2/cijena_min-20000/stranica-${count}`;
const santScrapedData = [];
try {
load_url(url, santScrapedData);
} catch (error) {
console.log(error);
}
};
//Function that does loading URL part of the scraper, and starting of process for fetching raw data.
const load_url = async (url, santScrapedData) => {
await axios.get(url).then((response) => {
const $ = cheerio.load(response.data);
fetch_raw_html($).each((index, element) => {
process_single_article($, index, element, santScrapedData);
});
data_functions.mergeData(santScrapedData); <- here is data passed into the mergeData component
});
};
//Part where raw html data is fetched but in div that we want.
const fetch_raw_html = ($) => {
return $('div[class="col-xxs-12 col-xss-6 col-xs-6 col-sm-6 col-lg-4"]');
};
//Here is all logic for getting data that we want, from the raw html.
const process_single_article = ($, index, element, santScrapedData) => {
const getLink = $(element).find('a[class="re-image"]').attr('href');
const getDescription = $(element).find('a[class="title"]').text();
const getPrice = $(element)
.find('div[class="prices"] > h3[class="price"]')
.text()
.replace(/\.| ?KM$/g, '')
.replace(',', '.');
const getPicture = $(element).find('img').attr('data-original');
const getSquaremeters = $(element)
.find('span[class="infoCount"]')
.first()
.text()
.replace(',', '.')
.split('m')[0];
const pricepersquaremeter =
parseFloat(getPrice) / parseFloat(getSquaremeters);
santScrapedData[index] = {
id: getLink.substring(42, 46),
link: getLink,
description: getDescription,
price: Math.round(getPrice),
picture: getPicture,
squaremeters: Math.round(getSquaremeters),
pricepersquaremeter: Math.round(pricepersquaremeter),
};
};
下面是 mergeData 组件:
let mergedApartments = [];
exports.mergeData = async (apartments) => {
//Fetching all apartments that are passed from scraper(s)
mergedApartments = mergedApartments.concat(apartments);
//Sending data for validation to the validation function
return mergedApartments;
};
this.mergeData().then((result) => console.log(result));
目前它返回 [undefined],而我希望其中填充的是来自两个抓取器的对象。
我想要的效果是:第一个抓取器返回一个约含 9 个对象的数组,第二个抓取器返回一个约含 30 个对象的数组。我希望把传入函数的这两个数组合并,并作为一个数组传递下去(因此 mergedApartments.length
约为 39)。我一直无法实现这一点;我尝试了其他问题中的许多解决方案,但都没有成功。
正确的解决方案是什么?谢谢!
答案 0 :(得分:1)
// Your data module:
const data_functions = {
mergedApartments: [],
mergeData(apartments){ this.mergedApartments.push(...apartments) },
};
// Scrape mock: async function, returning array of data
const load_url = () => new Promise(resolve => setTimeout(() => resolve([1,2,3,4]), 2000));
// scraper 1:
const santScraper = async () => {
const data = await load_url(); // Imagine here all operations for scraping data till you have array..
data_functions.mergeData(data);
return data;
};
// scraper 2:
const anotherScraper = async () => {
const data = await load_url(); // Imagine here all operations for scraping data till you have array..
data_functions.mergeData(data);
return data;
};
// Call both scrapers, await them and get all in one array:
Promise.all([santScraper(), anotherScraper()])
.then(() => console.log(data_functions.mergedApartments))