我想看看餐厅的评分是如何随时间变化的,所以想从 TripAdvisor 上收集评分以及用户发布评论的日期。我之所以考虑使用 Protractor,是因为它允许我以编程方式与浏览器进行交互。还有什么其他好的选项(也许基于 Node.js)适合这种任务?
答案 0 :(得分:0)
您可以使用Request.js(https://www.npmjs.com/package/request)库提取餐厅的html数据,然后使用Cheerio.js(https://www.npmjs.com/package/cheerio)解析生成的HTML。
这是解析评论的示例。
注意:抓取时请记住,您可能会违反网站的条款和条件。例如,请勿每隔 100 毫秒就重复抓取一次页面,否则您将被封禁!
"use strict";
const request = require('request');
const fs = require('fs');
const cheerio = require('cheerio');
const _ = require('lodash');
/*
 * TripAdvisor restaurant review page that the script downloads and parses.
 * Replace it with the review-page URL of any other restaurant.
 */
const restaurantUrl = "https://www.tripadvisor.ie/Restaurant_Review-g60745-d1954989-Reviews-Italian_Express_Pizzeria-Boston_Massachusetts.html";
/**
 * Reads the date of a single review from its rating element.
 *
 * The value is taken from the `title` attribute of the node's second child
 * (`children[1]`); presumably that is where the page stores the review date.
 *
 * @param {object} reviewRoot - Parsed DOM node matched for one review.
 * @returns {?string} The attribute value, or `null` when the path does not resolve.
 */
function getReviewDate(reviewRoot) {
  const datePath = ['children', 1, 'attribs', 'title'];
  return _.get(reviewRoot, datePath, null);
}
/**
 * Reads the title text of a single review.
 *
 * Starting from the parent of the given node, it follows
 * `children[1].children[0].children[0].children[0]` and returns that node's
 * `data` property.
 *
 * @param {object} reviewRoot - Parsed DOM node matched for one review; its `parent` is the starting point.
 * @returns {?string} The text found at that position, or `null` when the path does not resolve.
 */
function getReviewTitle(reviewRoot) {
  const container = reviewRoot.parent;
  const titlePath = ['children', 1, 'children', 0, 'children', 0, 'children', 0, 'data'];
  return _.get(container, titlePath, null);
}
/**
 * Builds one `{ date, title }` record per review found in the page.
 *
 * Every element matching `div.rating.reviewItemInline` is passed to
 * `getReviewDate` and `getReviewTitle`, in document order.
 *
 * @param {string} htmlData - HTML of the restaurant page.
 * @returns {Array<{date: ?string, title: ?string}>} One entry per matched element.
 */
function getReviewDetails(htmlData) {
  const $ = cheerio.load(htmlData);
  const ratingNodes = $('div.rating.reviewItemInline');
  return Array.from(ratingNodes, (ratingNode) => ({
    date: getReviewDate(ratingNode),
    title: getReviewTitle(ratingNode),
  }));
}
/**
 * Extracts the rating-distribution summary (label plus count) from the page.
 *
 * Looks at the first five `label.filterLabel` elements at most. For each one
 * it visits the text nodes that are grandchildren of the label: text whose
 * parent has class `row_label` becomes `name`, any other text becomes `value`.
 *
 * @param {string} htmlData - HTML of the restaurant page.
 * @returns {Array<{name?: string, value?: string}>} Entries indexed by label
 *   position; a label without any text grandchildren leaves its slot unset.
 */
function getReviewSummaries(htmlData) {
  const $ = cheerio.load(htmlData);
  const labels = $('label.filterLabel');
  const summaries = [];
  // Never read past what was actually matched: a blocked or changed page may
  // contain fewer than five labels, and indexing beyond the end would throw.
  const labelCount = Math.min(5, labels.length);
  for (let i = 0; i < labelCount; i++) {
    for (const child of labels[i].children || []) {
      // Nodes without a child list (e.g. bare text nodes) have nothing to inspect.
      for (const grandchild of child.children || []) {
        if (grandchild.type !== 'text') continue;
        if (summaries[i] === undefined) summaries[i] = {};
        const parentAttribs = grandchild.parent && grandchild.parent.attribs;
        if (parentAttribs && parentAttribs.class === 'row_label') {
          summaries[i].name = grandchild.data;
        } else {
          summaries[i].value = grandchild.data;
        }
      }
    }
  }
  return summaries;
}
// Download the restaurant page once, then print the rating summary and the
// per-review details extracted from it.
const options = {
  url: restaurantUrl,
  method: "get"
};
console.log('Requesting page..');
request(options, function (error, response, body) {
  if (error) {
    console.error('error:', error);
    return;
  }
  const statusCode = response && response.statusCode;
  console.log('Response: StatusCode:', statusCode);
  // A non-2xx answer (for example when the site blocks the scraper) is
  // presumably not the review page, so do not try to parse its body.
  if (typeof statusCode !== 'number' || statusCode < 200 || statusCode >= 300) {
    console.error('Unexpected status code, skipping parsing.');
    return;
  }
  const reviews = getReviewSummaries(body);
  console.log('Review summary: \r\n', reviews);
  const details = getReviewDetails(body);
  console.log("\r\n");
  console.log('Review details: \r\n', details);
});
您将看到类似下面的结果,其中现在包含了评论日期:
Response: StatusCode: 200
Review summary:
[ { name: 'Excellent', value: '554' },
{ name: 'Very good', value: '92' },
{ name: 'Average', value: '32' },
{ name: 'Poor', value: '9' },
{ name: 'Terrible', value: '6' } ]
Review details:
[ { date: '24 June 2018',
title: 'Whatever you choose, you can\'t go wrong' },
{ date: '23 June 2018', title: 'That\'s Amore!' },
{ date: '20 June 2018', title: 'Amazing pasta' },
{ date: '20 June 2018', title: 'Best Pizza' },
{ date: '18 June 2018', title: 'Italian food' },
{ date: '16 June 2018', title: 'Boston Dinner Adventure' },
{ date: '11 June 2018',
title: 'Delicious food - friendly service' },
{ date: '3 June 2018',
title: 'Hearty, Homemade, and Delicious!!!' },
{ date: '31 May 2018',
title: 'Amazing dinner - YOU WON\'T LEAVE HUNGRY!!!' },
{ date: '31 May 2018', title: 'Homemade ' } ]