如何从TripAdvisor收集带有发布日期的餐厅评分?

时间:2018-06-27 06:40:33

标签: node.js protractor tripadvisor

我想看看餐厅的评分是如何随着时间变化的,所以我想在TripAdvisor上收集评分以及用户发布评分的日期。我之所以考虑使用Protractor,是因为它允许我以编程方式与浏览器进行交互。还有什么其他好的选项(也许基于Node.js)适合这种任务?

1 个答案:

答案 0 :(得分:0)

您可以使用Request.js(https://www.npmjs.com/package/request)库提取餐厅的html数据,然后使用Cheerio.js(https://www.npmjs.com/package/cheerio)解析生成的HTML。

这是解析评论的示例。

注意:抓取数据时请记住,这样做可能违反网站的条款和条件。例如,请勿每隔100毫秒就重复抓取页面,否则您会被封禁!

"use strict";

const request = require('request');
const fs = require('fs');
const cheerio = require('cheerio');
const _ = require('lodash');

/* URL of the restaurant review page that is requested and parsed below; change it to whichever restaurant page you want. */
const restaurantUrl = "https://www.tripadvisor.ie/Restaurant_Review-g60745-d1954989-Reviews-Italian_Express_Pizzeria-Boston_Massachusetts.html";

/**
 * Reads the `title` attribute of the second child of a review rating node.
 * Presumably that attribute carries the review date (see the sample output);
 * this depends on the page markup and should be verified against the live site.
 *
 * @param {object} reviewRoot - Parsed DOM node of one review's rating element.
 * @returns {string|null} The attribute value, or null when the path is absent.
 */
function getReviewDate(reviewRoot) {
    const dateAttributePath = 'children[1].attribs.title';
    const reviewDate = _.get(reviewRoot, dateAttributePath, null);
    return reviewDate;
}

/**
 * Looks up the text of a review's title, starting from the PARENT of the
 * rating node: second child of the parent, then first child three levels
 * down, then that node's `data`.
 *
 * @param {object} reviewRoot - Parsed DOM node of one review's rating element;
 *     must not be null/undefined because its `parent` is read directly.
 * @returns {string|null} The text found at that path, or null when any step
 *     of the path is missing.
 */
function getReviewTitle(reviewRoot) {
    const reviewContainer = reviewRoot.parent;
    const titleTextPath = 'children[1].children[0].children[0].children[0].data';
    return _.get(reviewContainer, titleTextPath, null);
}

/**
 * Builds one `{ date, title }` record per review rating element found in the
 * page HTML.
 *
 * @param {string} htmlData - Raw HTML of the restaurant page.
 * @returns {Array<{date: (string|null), title: (string|null)}>} One entry per
 *     `div.rating.reviewItemInline` match, in document order.
 */
function getReviewDetails(htmlData) {
    const $ = cheerio.load(htmlData);
    const ratingNodes = $('div.rating.reviewItemInline');

    // The selection is array-like (length + numeric indices), so map it directly.
    return Array.from(ratingNodes, (ratingNode) => ({
        date: getReviewDate(ratingNode),
        title: getReviewTitle(ratingNode),
    }));
}

/**
 * Extracts the rating summary rows (name/value pairs such as
 * `{ name: 'Excellent', value: '554' }`) from the page HTML.
 *
 * For each of the first five `label.filterLabel` elements, every text node
 * found two levels below the label is inspected: text whose parent has the
 * class `row_label` becomes the entry's `name`, any other text becomes its
 * `value` (a later text node overwrites an earlier one).
 *
 * @param {string} htmlData - Raw HTML of the restaurant page.
 * @returns {Array<{name?: string, value?: string}>} One entry per inspected
 *     label, indexed by label position. A label with no text nodes leaves its
 *     slot unset. Empty when the page contains no matching labels.
 */
function getReviewSummaries(htmlData) {
    const $ = cheerio.load(htmlData);
    const labels = $('label.filterLabel');

    // At most five labels are read, as before. Clamping to the number actually
    // found avoids a TypeError when the page has fewer (e.g. a changed layout
    // or an error page returned instead of the restaurant page).
    const MAX_LABELS = 5;
    const labelCount = Math.min(MAX_LABELS, labels.length);

    const reviewObj = [];

    for (let i = 0; i < labelCount; i++) {
        for (const child of labels[i].children || []) {
            for (const grandchild of child.children || []) {
                if (grandchild.type !== 'text') {
                    continue;
                }

                // Create the entry lazily so labels without text stay unset.
                if (reviewObj[i] === undefined) {
                    reviewObj[i] = {};
                }

                const parent = grandchild.parent;
                const isRowLabel = Boolean(
                    parent && parent.attribs && parent.attribs.class === 'row_label'
                );

                if (isRowLabel) {
                    reviewObj[i].name = grandchild.data;
                } else {
                    reviewObj[i].value = grandchild.data;
                }
            }
        }
    }

    return reviewObj;
}

// Request settings: a GET of the restaurant page configured in restaurantUrl.
const options = {
    url: restaurantUrl,
    method: "get"
};

console.log('Requesting page..');

// Fetch the page, then print the rating summary followed by the per-review details.
request(options, (error, response, body) => {
    // Transport-level failure: report it and stop, there is no body to parse.
    if (error) {
        console.error('error:', error);
        return;
    }

    console.log('Response: StatusCode:', response && response.statusCode);

    const summary = getReviewSummaries(body);
    console.log('Review summary: \r\n', summary);

    const details = getReviewDetails(body);
    console.log("\r\n");
    console.log('Review details: \r\n', details);
});

您将看到类似下面的结果,其中现在包含了评论日期:

Response: StatusCode: 200
Review summary:
 [ { name: 'Excellent', value: '554' },
  { name: 'Very good', value: '92' },
  { name: 'Average', value: '32' },
  { name: 'Poor', value: '9' },
  { name: 'Terrible', value: '6' } ]


Review details:
 [ { date: '24 June 2018',
    title: 'Whatever you choose, you can\'t go wrong' },
  { date: '23 June 2018', title: 'That\'s Amore!' },
  { date: '20 June 2018', title: 'Amazing pasta' },
  { date: '20 June 2018', title: 'Best Pizza' },
  { date: '18 June 2018', title: 'Italian food' },
  { date: '16 June 2018', title: 'Boston Dinner Adventure' },
  { date: '11 June 2018',
    title: 'Delicious food - friendly service' },
  { date: '3 June 2018',
    title: 'Hearty, Homemade, and Delicious!!!' },
  { date: '31 May 2018',
    title: 'Amazing dinner - YOU WON\'T LEAVE HUNGRY!!!' },
  { date: '31 May 2018', title: 'Homemade ' } ]