使用cheerio从使用cheerio提取的链接中抓取数据

时间:2020-06-08 21:11:03

标签: node.js api web-scraping cheerio

我正在使用cheerio和nodejs从allegro网站抓取数据,以便在API中创建一个端点,将这些数据以csv形式返回,稍后作为数据科学项目的一部分进行研究:

https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605

要获取汽车信息,我已经设法从首页上抓取了每个链接,这些链接会跳转到对应的汽车页面(汽车项目),在那里可以查看汽车的完整信息。我需要从每个链接中抓取更多数据,该怎么做?

以及如何使json数据显示为csv?

此处使用的代码:

// Category listing page that getCars() downloads and parses.
const url =
  "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605";

// NOTE: CSV export (a write stream to "allegro.csv" with a "Price,Link"
// header and one row per car) was sketched here but is not wired up.

/**
 * Downloads the listing page and builds one record per offer tile.
 *
 * Relies on `fetch` and `cheerio` being in scope.
 *
 * @returns {Promise<Array<{price: string, link: (string|undefined), year: string, make: string}>>}
 *   Resolves with the records in the order the tiles appear in the markup.
 */
function getCars() {
  // Turns the downloaded markup into plain car records.
  const extractCars = (html) => {
    const $ = cheerio.load(html);
    const found = [];

    $("._9c44d_2H7Kt").each((_index, tile) => {
      const $tile = $(tile);
      const priceText = $tile.find("._9c44d_1zemI").text();
      // The year is read from the third element reached from the first <dd>.
      const yearText = $tile.find("dd").first().next().next().text();

      found.push({
        // Drop runs of two or more whitespace characters left by the markup.
        price: priceText.replace(/\s\s+/g, ""),
        link: $tile.find("a").attr("href"),
        year: yearText,
        make: $tile.find("h2").text(),
      });
    });

    return found;
  };

  return fetch(url)
    .then((res) => res.text())
    .then(extractCars);
}

用于nodejs端点的代码:

/**
 * GET /scraping/:allegro — responds with the scraped car list as JSON.
 *
 * On scraper failure the error is logged and a 500 JSON response is sent,
 * so the request never hangs on a rejected promise.
 *
 * NOTE(review): the `:allegro` route parameter is forwarded to
 * `scraper.getCars`; confirm the scraper actually uses it.
 */
app.get("/scraping/:allegro", (req, res) => {
  // Route parameters live on `req.params`. `req.param` is not the parameter
  // map, so `req.param.allegro` always evaluated to undefined.
  scraper
    .getCars(req.params.allegro)
    .then((cars) => {
      res.json(cars);
    })
    .catch((err) => {
      console.error(err);
      res.status(500).json({ error: "Failed to scrape cars" });
    });
});

从每个链接获取的数据如下:添加日期,型号,电话号码,城市,vin

1 个答案:

答案 0 :(得分:1)

这些页面有一个方便的地方,就是您可以通过将媒体类型设置为application/json(例如,设置Accept标头)来以JSON而不是html的形式返回数据。

例如获取列表:

curl "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605&order=dd" \
     -H "Accept: application/json"

要获取特定项目:

curl "https://allegro.pl/ogloszenie/mercedes-ml320-9341716141" -H "Accept: application/json"

因此,您不必使用网页抓取工具,只需解析JSON即可。分页也很方便:添加查询参数&p=PAGE_NUM即可完成。

我用Python做了一个小例子,可以很容易地移植到JS。它先请求汽车列表,然后请求列表中的第一个元素:

import requests 
import json
import pandas as pd

def _listing_row(offer):
    """Flatten one promoted listing entry into a single-level dict.

    The fixed fields come first; each listing parameter is then stored under
    its own name, keeping only the first of its values.
    """
    row = {
        "name": offer["name"],
        "url": offer["url"],
        "price": offer["sellingMode"]["advertisement"]["price"]["amount"],
    }
    row.update((param["name"], param["values"][0]) for param in offer["parameters"])
    return row


# Request the category listing; the Accept header asks for JSON instead of HTML.
listing_query = {
    "bmatch": "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
    "order": "dd",
}
r = requests.get(
    "https://allegro.pl/kategoria/samochody-osobowe-4029",
    headers={"Accept": "application/json"},
    params=listing_query,
)
promoted = r.json()["pagination bottom"]["collection"]["items"]["promoted"]
data = list(map(_listing_row, promoted))
df = pd.DataFrame(data)
print(df)

# Follow the first listing's URL and pull the detail fields from its JSON.
print("get data for first element")
r = requests.get(data[0]["url"], headers={"Accept": "application/json"})
item = r.json()
offer = item["summary"]["offer"]
item_data = {
    "phone": offer["contact"]["phones"][0]["number"],
    "delivery": offer["delivery"]["summary"][0]["value"]["text"],
    "startingAt": offer["publication"]["startingAt"],
    "endingAt": offer["publication"]["endingAt"],
}
# Parameters of the first group are added under their own names (label of the first value).
item_data.update(
    (param["name"], param["values"][0]["valueLabel"])
    for param in offer["parametersGroups"]["groups"][0]["parameters"]
)

print(item_data)

在Node.js中使用axios的实现:

const axios = require("axios");

/**
 * Fetches the promoted car listings as JSON, logs a flattened record for each
 * one, then fetches the first listing's own URL and logs its phone, delivery,
 * publication dates and first-group parameters.
 *
 * NOTE(review): at module scope this function name shadows Node's global
 * `process` object; the name is kept so existing callers keep working.
 *
 * @returns {Promise<void>} Resolves once the results have been logged. When no
 *   promoted listing is returned, a notice is logged and the detail request is
 *   skipped.
 * @throws Rejects when a request fails or a response lacks the fields read below.
 */
async function process() {
    let response = await axios.get('https://allegro.pl/kategoria/samochody-osobowe-4029', {
        // axios builds the query string from `params`; an unknown `query` key
        // is ignored, so `bmatch` and `order` were never sent.
        params: {
            "bmatch": "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
            "order": "dd"
        },
        responseType: "json"
    });
    const promoted = response.data["pagination bottom"].collection.items.promoted;
    // Declared locally: assigning to an undeclared `list` creates an implicit
    // global and throws in strict mode / ES modules.
    const list = promoted.map((offer) => {
        const item = {
            name: offer.name,
            url: offer.url,
            price: offer.sellingMode.advertisement.price.amount,
        };
        // Each listing parameter becomes its own key, keeping the first value.
        for (const param of offer.parameters) {
            item[param.name] = param.values[0];
        }
        return item;
    });
    console.log(list);
    if (list.length === 0) {
        // Nothing to follow up on; avoids a TypeError on `list[0].url`.
        console.log("no promoted listings returned");
        return;
    }
    console.log(`fetching : ${list[0].url}`);
    response = await axios.get(list[0].url, {
        responseType: "json"
    });
    const offer = response.data.summary.offer;
    const entry = {
        phone: offer.contact.phones[0].number,
        delivery: offer.delivery.summary[0].value.text,
        startingAt: offer.publication.startingAt,
        endingAt: offer.publication.endingAt
    };
    // Parameters of the first group are added under their own names.
    for (const param of offer.parametersGroups.groups[0].parameters) {
        entry[param.name] = param.values[0].valueLabel;
    }
    console.log(entry);
}

// Run the example. A rejection is reported instead of being left as a floating
// promise. `process` here is the async function defined in this file, which
// shadows Node's global of the same name, hence `globalThis.process` below.
process().catch((err) => {
    console.error(err);
    globalThis.process.exitCode = 1;
});