我正在使用 cheerio 和 Node.js 从 allegro 网站抓取数据,并在 API 中创建一个端点,以 CSV 格式返回这些数据,稍后将作为数据科学项目的一部分进行研究:
为了获取汽车信息,我已经从首页抓取到了每个链接,这些链接指向各辆汽车的详情页(汽车条目),可以查看汽车的完整信息。我还需要从每个链接中抓取更多数据,该怎么做?
另外,如何将 JSON 数据输出为 CSV?
此处使用的代码:
// Allegro listing page for passenger cars; getCars() downloads this address.
const url = "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605";

// CSV export is currently disabled; the stream and its header row are kept
// here for reference.
// const writeStream = fs.createWriteStream("allegro.csv");
// writeStream.write(`Price,Link \n`);
/**
 * Download the listing page at the module-level `url` and scrape one record
 * per offer tile found in the returned HTML.
 *
 * @returns {Promise<Array<{price: string, link: (string|undefined), year: string, make: string}>>}
 *   Resolves with one record per tile matched by the `._9c44d_2H7Kt` selector.
 */
function getCars() {
  // Build the record for a single offer tile.
  const readTile = ($, tile) => {
    const $tile = $(tile);
    return {
      // Remove every run of two or more whitespace characters from the price.
      price: $tile.find("._9c44d_1zemI").text().replace(/\s\s+/g, ""),
      link: $tile.find("a").attr("href"),
      // The year is read from the element two siblings after the first <dd>.
      year: $tile.find("dd").first().next().next().text(),
      make: $tile.find("h2").text(),
    };
  };

  // Parse the page markup and collect a record for each tile.
  const parseListing = (html) => {
    const $ = cheerio.load(html);
    const found = [];
    $("._9c44d_2H7Kt").each((index, tile) => {
      found.push(readTile($, tile));
    });
    // NOTE: writing rows to the CSV stream is disabled:
    // writeStream.write(`${price},${link} \n`);
    return found;
  };

  return fetch(`${url}`)
    .then((res) => res.text())
    .then(parseListing);
}
用于nodejs端点的代码:
// GET /scraping/:allegro — respond with the scraped car list as JSON.
// Route parameters are exposed on `req.params`; `req.param` is a function, so
// reading `.allegro` from it always produced `undefined`.
// NOTE(review): the getCars() shown alongside this route declares no
// parameters, so the value passed here appears to be unused — confirm.
app.get("/scraping/:allegro", (req, res, next) => {
  scraper
    .getCars(req.params.allegro)
    .then((cars) => {
      res.json(cars);
    })
    // Forward failures to Express' error handling so a failed scrape does not
    // leave the request without a response.
    .catch(next);
});
需要从每个链接获取的数据如下:添加日期、型号、电话号码、城市、VIN
答案 0 :(得分:1)
这些页面有一个方便之处:只要把媒体类型设置为 `application/json`(例如设置 `Accept` 请求头),就可以让服务器以 JSON 而不是 HTML 的形式返回数据。
例如获取列表:
curl "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605&order=dd" \
-H "Accept: application/json"
要获取特定项目:
curl "https://allegro.pl/ogloszenie/mercedes-ml320-9341716141" -H "Accept: application/json"
因此,您不必使用网页抓取工具,只需解析 JSON 即可。分页同样很方便:添加查询参数 `&p=PAGE_NUM` 即可完成分页。
我在python中做了一个小例子,可以很容易地移植到JS。它请求汽车列表,然后请求第一个元素:
import requests
import json
import pandas as pd
# Allegro returns JSON instead of HTML when the Accept header asks for it.
json_headers = {"Accept": "application/json"}

# Step 1: request the car listing and flatten each promoted offer into a row.
listing = requests.get(
    "https://allegro.pl/kategoria/samochody-osobowe-4029",
    headers=json_headers,
    params={
        "bmatch": "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
        "order": "dd",
    },
)
promoted = listing.json()["pagination bottom"]["collection"]["items"]["promoted"]

data = []
for offer in promoted:
    row = {
        "name": offer["name"],
        "url": offer["url"],
        "price": offer["sellingMode"]["advertisement"]["price"]["amount"],
    }
    # Every listing parameter becomes its own column; only its first value is kept.
    for param in offer["parameters"]:
        row[param["name"]] = param["values"][0]
    data.append(row)

df = pd.DataFrame(data)
print(df)

# Step 2: request the detail document of the first offer in the list.
print("get data for first element")
detail = requests.get(data[0]["url"], headers=json_headers)
item = detail.json()
offer_detail = item["summary"]["offer"]

item_data = {
    "phone": offer_detail["contact"]["phones"][0]["number"],
    "delivery": offer_detail["delivery"]["summary"][0]["value"]["text"],
    "startingAt": offer_detail["publication"]["startingAt"],
    "endingAt": offer_detail["publication"]["endingAt"],
}
# Parameters of the first parameter group are added under their own names,
# using the label of each parameter's first value.
for param in offer_detail["parametersGroups"]["groups"][0]["parameters"]:
    item_data[param["name"]] = param["values"][0]["valueLabel"]

print(item_data)
nodejs中使用axios的实现:
const axios = require("axios");
/**
 * Fetch the Allegro car listing as JSON, log one flattened record per promoted
 * offer, then fetch the first offer's detail document and log its phone,
 * delivery, publication dates and parameters.
 *
 * NOTE(review): this function has the same name as Node's global `process`
 * object and hides it in the scope where it is declared; the name is kept so
 * existing callers keep working.
 *
 * @returns {Promise<void>} Resolves after the results have been logged. When
 *   the listing has no promoted offers, resolves right after logging the
 *   empty list. Rejects if a request fails or a response lacks an expected
 *   field.
 */
async function process() {
  const listing = await axios.get("https://allegro.pl/kategoria/samochody-osobowe-4029", {
    // Allegro returns JSON instead of HTML when the Accept header asks for it.
    headers: { Accept: "application/json" },
    // axios builds the query string from `params`; a `query` key is not a
    // request option and would be ignored, so the filters were never sent.
    params: {
      bmatch: "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
      order: "dd",
    },
    responseType: "json",
  });

  const promoted = listing.data["pagination bottom"].collection.items.promoted;
  const list = promoted.map((offer) => {
    const item = {
      name: offer.name,
      url: offer.url,
      price: offer.sellingMode.advertisement.price.amount,
    };
    // Each listing parameter becomes its own property; only the first value is kept.
    for (const param of offer.parameters) {
      item[param.name] = param.values[0];
    }
    return item;
  });
  console.log(list);

  // Nothing to look up when the listing is empty.
  if (list.length === 0) {
    return;
  }

  const firstUrl = list[0].url;
  console.log(`fetching : ${firstUrl}`);
  const detail = await axios.get(firstUrl, {
    headers: { Accept: "application/json" },
    responseType: "json",
  });

  const offer = detail.data.summary.offer;
  const entry = {
    phone: offer.contact.phones[0].number,
    delivery: offer.delivery.summary[0].value.text,
    startingAt: offer.publication.startingAt,
    endingAt: offer.publication.endingAt,
  };
  // Parameters of the first parameter group are added under their own names,
  // using the label of each parameter's first value.
  for (const parameter of offer.parametersGroups.groups[0].parameters) {
    entry[parameter.name] = parameter.values[0].valueLabel;
  }
  console.log(entry);
}
// Start the scrape. `void` marks the returned promise as intentionally not awaited.
void process();