我试图编写一个脚本来从公共Codecademy配置文件中删除用户的成就。目前,这就是我尝试解析数据的方式(使用Cheerio后端,Node.js):
var express = require('express');
var cheerio = require('cheerio');
var request = require('request');
var app = express();
app.get('/scrape/:username', function(req, res){
var user = req.params.username;
console.log('Processing request for ' + user);
url = 'http://www.codecademy.com/' + user + '/achievements';
request(url, function(error, response, body){
if (!error){
var $ = cheerio.load(body);
var title, date
var json = {
achievements: [],
meta: {
request: {
user: user,
time: Date.now(),
},
server: {
version: 1,
contact: 'benedict@ovalbit.com'
}
}
};
console.log('Running parser and scraping achievements.');
$('.achievement-card').each(function() {
var data = $(this);
title = data.children('h5').text();
date = data.find('small.text--ellipsis').text();
console.log('Title: ' + title);
console.log('Date: ' + date);
json.achievements.push({
title: title,
date: date
});
});
res.type('application/json');
res.json(json);
}
});
});
app.listen('3006');
console.log('Running on port 3006.');
供参考,this是个人资料页面的样子:
但是,我的每个功能都不会运行。有什么想法吗?
答案 0 :(得分:1)
您似乎错误地生成了错误的网址,因为您错过了用户名前的/users/
部分。我刚刚在您的示例成就页面上使用了该选择器,它适用于我(节点v0.10.30,cheerio v0.17.0,请求v2.46.0):
var request = require('request'),
cheerio = require('cheerio');
var url = 'http://www.codecademy.com/users/BenedictLewis/achievements';
request(url, function(err, res, body) {
var $ = cheerio.load(body);
$('.achievement-card').each(function() {
var data = $(this);
title = data.children('h5').text();
date = data.find('small.text--ellipsis').text();
console.log('Title: ' + title);
console.log('Date: ' + date + '\n');
});
});
输出:
Title: Introduction to 'For' Loops in JS Date: Feb 1, 2014 Title: 50 Exercises Date: Feb 1, 2014 Title: Build "Rock, Paper, Scissors" Date: Jan 18, 2014 Title: Introduction to Functions in JS Date: Jan 18, 2014 Title: 25 points earned in one day Date: Jan 11, 2014 Title: Choose Your Own Adventure! Date: Jan 11, 2014 Title: 25 Exercises Date: Jan 11, 2014 Title: Getting Started with Programming Date: Jan 11, 2014 Title: 10 Exercises Date: Jan 11, 2014 Title: First Lesson Date: Jan 11, 2014 Title: Max Streak Count of 1 Date: Jan 11, 2014