我正在用 Node.js 构建一个简单的网页抓取器(web scraper),同时使用了 Express.js、Request 和 Cheerio。我想抓取以下网址:http://www.houzz.com/professionals/c/Nashville,-TN
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
/**
 * GET /scrape
 *
 * Fetches the Houzz professionals listing for Nashville, TN, collects the
 * text of every `.pro-title`, `.pro-phone`, `.pro-review-string` (first child
 * only) and `.pro-description` element on the page, and writes the four
 * resulting lists to the relative path `houzz.json` as pretty-printed JSON.
 *
 * The HTTP response is only a short status message; the scraped data goes to
 * the file and (for titles and phone numbers) to the console.
 */
app.get('/scrape', function (req, res) {
    // Declared locally so the URL does not leak as an implicit global.
    var url = 'http://www.houzz.com/professionals/c/Nashville,-TN';

    request(url, function (error, response, html) {
        // Without a page there is nothing to parse or write, so report the
        // failure instead of writing an undefined payload and claiming success.
        if (error) {
            console.error('Request for ' + url + ' failed:', error);
            res.status(500).send('Scrape failed - check your console!');
            return;
        }

        var $ = cheerio.load(html);
        var title = [];
        var contact = [];
        var review = [];
        var description = [];

        // A field keeps its "" placeholder when nothing on the page matches
        // the corresponding selector; otherwise it becomes the list of matches.
        var json = { title : "", contact : "", review: "", description: "" };

        // .each() is the iteration primitive; .filter() expects a predicate.
        $('.pro-title').each(function () {
            title.push($(this).text());
            console.log(title);
            json.title = title;
        });

        $('.pro-phone').each(function () {
            contact.push($(this).text());
            console.log(contact);
            json.contact = contact;
        });

        // The review count is the text of the element's first child.
        $('.pro-review-string').each(function () {
            review.push($(this).children().first().text());
            json.review = review;
        });

        $('.pro-description').each(function () {
            description.push($(this).text());
            json.description = description;
        });

        fs.writeFile('houzz.json', JSON.stringify(json, null, 4), function (err) {
            // Only announce success when the write actually succeeded.
            if (err) {
                console.error('Could not write houzz.json:', err);
                return;
            }
            console.log('File successfully written! - Check your project directory for the houzz.json file');
        });

        // The response is sent without waiting for the file write to finish.
        res.send('Check your console!');
    });
});
// Start the Express app on port 8081 (passed as a string) and log the port.
app.listen('8081');
console.log('Port 8081');

// Publish the app as this module's export, then point the local `exports`
// alias at the same object.
module.exports = app;
exports = module.exports;
输出的 houzz.json 文件示例如下所示:
{
"title": [
"Marcelle Guilbeau, Interior Designer",
"Country Flooring DIrect",
"Eric Ross Interiors, LLC",
"Hermitage Kitchen Design Gallery",
"William Johnson Architect"
],
"contact": [
"(615) 815-9309",
"(615) 646-0366",
"(615) 472-8236",
"(615) 843-3310",
"(615) 292-4017"
],
"review": [
"77",
"1",
"14",
"14",
"15"
],
"description": [
"Marcelle takes her clients on a journey, drawing out their needs to create an oasis that reflects their personal sense of style and renews their connection to those things about...\t\t\tRead More\n\t\t\t",
"Country Flooring Direct is the local flooring option that will handle your flooring needs. Give Country Flooring Direct a call and find out why lower overhead means lower prices.\t\t\tSee my projects\n\t\t",
"Eric Ross Interiors exists to create beautiful interiors and a luxury design experience for its clients. We are committed to creating whole room environments for our clients in...\t\t\tRead More\n\t\t\t",
"We are a total design center that offers the finest in custom cabinetry, with the best possible level of creativity, design and service. We are located within Hermitage Lighting Gallery.\t\t\tSee my projects\n\t\t",
"William C. Johnson Architect, LLC is a small, full service architectural design firm. Since 1985, WCJA has helped clients achieve their design goals, from small residential...\t\t\tRead More\n\t\t\t"
]
}
如何重组 houzz.json 文件,使其结构如下所示:
{
0:
[
title:
contact:
review:
description:
1:
[
title:
contact:
review:
description:
]
答案 0 :(得分:1)
在我看来,你是在以无序的方式抓取内容。
你应该先获取每个带有 "vcard" 类的元素,然后在其中遍历你想要的子元素(pro-title、pro-phone 等)。
DOM 元素的结构本身已经能帮助你保持数据的组织有序。
vcard
pro-title
pro-phone
pro-review-string
pro-description
vcard
pro-title
pro-phone
pro-review-string
pro-description
所以代码大致如下。你可能需要使用 $(this).find() 在每个 vcard 内部查找元素。
// Builds one record per `.vcard` element so that the title, phone, review
// count and description belonging to the same professional stay together.
// Relies on `$` being a Cheerio instance that is already loaded with the page.
var allmycards = [];
$('.vcard').each(function () {
    // Wrap the current card once; every lookup below is scoped to it.
    var card = $(this);
    var title = [];
    var contact = [];
    var review = [];
    var description = [];

    // A field keeps its "" placeholder when the card has no matching element;
    // otherwise it becomes the list of matches found inside this card.
    var json = { title : "", contact : "", review: "", description: "" };

    // .each() is the iteration primitive; .filter() expects a predicate.
    card.find('.pro-title').each(function () {
        title.push($(this).text());
        console.log(title);
        json.title = title;
    });

    card.find('.pro-phone').each(function () {
        contact.push($(this).text());
        console.log(contact);
        json.contact = contact;
    });

    // The review count is the text of the element's first child.
    card.find('.pro-review-string').each(function () {
        review.push($(this).children().first().text());
        json.review = review;
    });

    card.find('.pro-description').each(function () {
        description.push($(this).text());
        json.description = description;
    });

    allmycards.push(json);
});