我已经能让爬虫(scraper)抓取到我想要的内容,但在让它循环遍历我想抓取的所有页面时遇到了很多问题。我认为问题可能出在 for 循环的位置以及它的执行方式上。
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
// Scratch variables that hold the most recently scraped author and quote text.
var author;
var quote;
// Collected results: one array of author names and one array of quote texts.
var json = {
    author: [],
    quote: []
};
// Register the GET /scrape route. The handler requests a Goodreads quotes
// page, pushes the scraped author and quote text into the module-level `json`
// object, tidies the quotes, writes the result to output.json and then
// replies to the client.
app.get('/scrape', function(req, res){
// Build the URL of the page to scrape.
这是问题所在,我如何设置它以便它不仅仅设置和循环最后一页,而是所有101页?
// NOTE: every iteration assigns to the same `url` variable, so when the loop
// finishes only the last value (page=100, because the bound is i < 101)
// is left, and that is the only page requested below.
// NOTE: `url` is not declared with var anywhere in the visible script, so this
// assignment creates an implicit global.
for(var i = 1; i < 101; i++){
url = 'http://www.goodreads.com/quotes?page=' + i;
}
// A single request is issued, using whatever `url` holds after the loop.
request(url, function(error, response, html){
if(!error){
// Load the returned HTML into cheerio to get a jQuery-style `$` selector.
var $ = cheerio.load(html);
// Visit every link that is a direct child of a .quoteText element and record
// its text as an author. filter() is used only to iterate here: the callback
// returns nothing and filter's result is discarded.
$('.quoteText > a').filter(function(){
var data = $(this);
author = data.text();
json.author.push(author);
// all.push(data.text());
})
// Visit every .quoteText element and record its full text as a raw quote
// (again using filter() purely for iteration).
$('.quoteText').filter(function(){
var data = $(this);
quote = data.text();
json.quote.push(quote);
})
}
// The clean-up, file write and response below sit outside the if(!error)
// branch, so they also run when the request failed.
// Loop through the collected quotes to clean up the strings.
for(var i = 0; i < json.quote.length; i++) {
// Find the "―" character that follows the quote text.
// NOTE: `endQuote` is assigned without a declaration (implicit global), and
// indexOf returns -1 when the character is absent.
endQuote = json.quote[i].indexOf("―")
// Keep only the text before the dash, dropping the one character right before it.
json.quote[i] = json.quote[i].substring(0, endQuote - 1);
// Strip line breaks (\r\n, \n, \r) from the string.
json.quote[i] = json.quote[i].replace(/(\r\n|\n|\r)/gm,"");
}
// Serialize `json` with 4-space indentation and write it to output.json.
// NOTE: `err` is never inspected, so the success message is logged even if
// the write failed.
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output.json file');
})
// Reply to the client once the request callback has run; this does not wait
// for the file write to finish.
res.send('Check your console!')
})
})
// Start the server on port 8081 and announce it on the console.
app.listen('8081');
console.log('Magic happens on port 8081');
// Make the Express app this module's export, and point `exports` at it too.
module.exports = app;
exports = module.exports;
**编辑**
修改了代码,把 res.send('Check your console!') 放到函数调用的末尾执行,
因为如果多次调用 res.send,应用会抛出错误。另外还加入了基于已接受答案的修改。
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
// Scratch variables that hold the most recently scraped author and quote text.
var author;
var quote;
// Collected results: one array of author names and one array of quote texts.
var json = {
    author: [],
    quote: []
};
// List of page URLs to scrape; starts out empty.
var url = [];
// Register the GET /scrape route.
// The handler requests pages 1-100 of the Goodreads quotes listing, pushes
// every scraped author and quote into the module-level `json` object and,
// once every request has completed (successfully or not), calls cleanUp() to
// tidy the quotes and write output.json.
// The HTTP response is sent right away, exactly once, without waiting for the
// scraping to finish.
// NOTE: `json` lives at module level, so results from repeated /scrape calls
// accumulate in the same arrays.
app.get('/scrape', function(req, res){
    var PAGE_COUNT = 100;

    // Build the page list locally on every call. Pushing into a shared
    // module-level array would add another 100 URLs per request and re-fetch
    // pages that were already scraped.
    var pageUrls = [];
    for (var page = 1; page <= PAGE_COUNT; page++) {
        pageUrls.push('http://www.goodreads.com/quotes?page=' + page);
    }

    // Number of requests still in flight; cleanUp() runs when it reaches zero.
    var pending = pageUrls.length;

    pageUrls.forEach(function(pageUrl){
        request(pageUrl, function(error, response, html){
            if (error) {
                // Report the failure instead of dropping it silently; the
                // remaining pages are still processed.
                console.error('Failed to fetch ' + pageUrl + ':', error);
            } else {
                // Load the returned HTML into cheerio for jQuery-style selection.
                var $ = cheerio.load(html);

                // Every link directly inside a .quoteText element is an author.
                $('.quoteText > a').each(function(){
                    author = $(this).text();
                    json.author.push(author);
                });

                // The full text of each .quoteText element is the raw quote;
                // cleanUp() trims it afterwards.
                $('.quoteText').each(function(){
                    quote = $(this).text();
                    json.quote.push(quote);
                });
            }

            // Count this request as finished whether it succeeded or failed,
            // so one bad page cannot prevent the output file being written.
            pending -= 1;
            if (pending === 0) {
                cleanUp();
            }
        });
    });

    // Respond exactly once, outside the per-page callbacks.
    res.send('Check your console!');
});
// Tidy every entry of the module-level `json.quote` array in place, then
// write the whole `json` object to output.json (pretty-printed, 4 spaces).
// Each quote is cut off just before the "―" character that follows the quote
// text (dropping the one character immediately before the dash), and all line
// breaks are removed. A quote that contains no "―" keeps its full text, which
// also makes it safe to run this function more than once.
function cleanUp(){
    for (var i = 0; i < json.quote.length; i++) {
        var text = json.quote[i];
        // Find the index of the dash that follows the quote text.
        var endQuote = text.indexOf("―");
        // indexOf returns -1 when there is no dash; truncating in that case
        // would wipe out the whole string, so only cut when it was found.
        if (endQuote !== -1) {
            text = text.substring(0, endQuote - 1);
        }
        // Strip line breaks (\r\n, \n, \r) from the string.
        json.quote[i] = text.replace(/(\r\n|\n|\r)/gm, "");
    }
    // Write the json file to the current working directory.
    fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
        if (err) {
            // Report a failed write instead of claiming success.
            console.error('Failed to write output.json:', err);
            return;
        }
        console.log('File successfully written! - Check your project directory for the output.json file');
    });
}
// Start the server on port 8081 and announce it on the console.
app.listen('8081');
console.log('Magic happens on port 8081');
// Make the Express app this module's export, and point `exports` at it too.
module.exports = app;
exports = module.exports;
答案 0 :(得分:1)
在您提供的示例代码中:
// Gather the address of every quotes page (1 through 100) into one array.
var url = [];
var i = 1;
while (i < 101) {
    url.push('http://www.goodreads.com/quotes?page=' + i);
    i++;
}
原来的 for 循环每次迭代都会覆盖 url 变量,所以循环结束后 url 里只剩下最后一页的地址。
您只需对代码做一些小改动就能让它正常工作;最简单的方法是把 url 改成一个数组,每次循环时把地址 push 进数组,这样 url 列表就会不断累积,就像上面的代码那样。
然后,由于 url 现在是一个包含 100 个元素的数组,您需要对数组中的每一项调用 request 函数,并且把 fs.writeFile 调用改为 fs.appendFile,这样每次 request 调用得到的结果都会追加到 output.json 文件中,而不是覆盖它。
最后,您还应该考虑对请求进行限速(throttling),以免给被抓取网站的服务器造成过大的压力。