使用Node.js网页爬虫(web scraper)循环遍历DOM元素时遇到问题

时间:2015-06-16 03:52:19

标签: javascript jquery node.js

我已经能让爬虫抓取到我想要的内容,但在让它真正循环遍历我希望抓取的所有页面时遇到了很多问题。我认为问题可能出在for循环的位置以及它的执行方式上。

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();
//Scratch variables the scraping callbacks reuse for the text of the element being processed
var author, quote;
//Object populated by the scraped DOM elements: one array of author texts, one of quote texts
var json = { author : [], quote : []};

//Initialize the scraper the scraper url in the DOM
app.get('/scrape', function(req, res){
    //set the scraper url

问题就出在这里:我该如何设置,才能让它不是只抓取最后一页,而是遍历全部页面(第1页到第100页)?

    //NOTE(review): every iteration assigns a new string to the same `url` variable,
    //so once the loop ends only the last value (page=100) is left.
    //`url` is also never declared in this snippet, so this assignment creates an implicit global.
    for(var i = 1; i < 101; i++){
          url = 'http://www.goodreads.com/quotes?page=' + i;
    }

//

    //Fetch a single page and process it in the callback. `url` is whatever the
    //loop above assigned last, so only one request is made per hit on /scrape.
    request(url, function(error, response, html){
        if(!error){
            //use cheerio to use jquery to select DOM elements
            var $ = cheerio.load(html);

            //select DOM elements using jquery selectors
            //(.filter is used here only to visit each match; its return value is discarded)
            $('.quoteText > a').filter(function(){
                var data = $(this);
                author = data.text();

                json.author.push(author);
                // all.push(data.text());
            })
            //select DOM elements using jquery selectors
            $('.quoteText').filter(function(){
                var data = $(this);
                quote = data.text();

                json.quote.push(quote);
            })
        }
        //NOTE(review): everything below sits outside the if(!error) check, so it also runs when the request failed
        //loop through json object to clean up strings
        for(var i = 0; i < json.quote.length; i++) {
            //find the index of where the quote ends
            //NOTE(review): endQuote is assigned without var, which makes it an implicit global
            endQuote = json.quote[i].indexOf("―")
            //select only the part of the string that contains a quote
            //(this also drops the one character right before the dash; when no dash is found,
            //indexOf gives -1 and substring(0, -2) yields an empty string)
            json.quote[i] = json.quote[i].substring(0, endQuote - 1);
            //remove line breaks (\r\n, \n, \r) from string
            json.quote[i] = json.quote[i].replace(/(\r\n|\n|\r)/gm,"");
        }
        //write the json file to folder
        //NOTE(review): err is never inspected, so the success message is logged even if the write failed
        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
            console.log('File successfully written! - Check your project directory for the output.json file');
        })

        //answer the HTTP request; this runs right after writeFile is started, not after it completes
        res.send('Check your console!')
    })
})

//Start listening for HTTP requests on port 8081 (the port is passed as a string)
app.listen('8081')
//Logged immediately after listen() is called
console.log('Magic happens on port 8081');
//Expose the Express app as this module's export
exports = module.exports = app;

**编辑**

修改了代码,把res.send('Check your console!')移到函数末尾执行,因为多次调用res.send会导致应用抛出错误。此外还包含了根据已采纳答案所做的更改。

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();
//Scratch variables the scraping callbacks reuse for the text of the element being processed
var author, quote;
//Object populated by the scraped DOM elements: one array of author texts, one of quote texts
var json = { author : [], quote : []};
//List of page urls to scrape; filled inside the /scrape handler
var url = []

//Initialize the scraper the scraper url in the DOM
//Register GET /scrape: fetch every quotes page, collect the author and quote
//texts into the module-level json object, then run cleanUp() and answer the
//HTTP request once every page has been handled.
//NOTE: the collections are module-level, so overlapping hits on this route
//would share (and reset) the same arrays.
app.get('/scrape', function(req, res){
    //start from empty collections so hitting the route again does not
    //pile duplicates onto the results of a previous run
    url.length = 0;
    json.author.length = 0;
    json.quote.length = 0;

    //set the scraper urls (pages 1 through 100)
    for(var page = 1; page < 101; page++){
        url.push('http://www.goodreads.com/quotes?page=' + page);
    }

    //number of requests that have not called back yet
    var pending = url.length;

    //Called once per request, whether it succeeded or failed. The last one to
    //arrive cleans up and saves the data, then sends the response exactly once
    //(sending it more than once makes Express throw).
    function pageDone(){
        pending--;
        if(pending === 0){
            cleanUp();
            res.send('Check your console!');
        }
    }

    //forEach instead of for...in: it visits only the array elements and
    //hands each callback its own pageUrl
    url.forEach(function(pageUrl){
        request(pageUrl, function(error, response, html){
            if(error){
                //report the failed page instead of dropping it silently
                console.log('Failed to fetch ' + pageUrl + ': ' + error);
            } else {
                //use cheerio to use jquery to select DOM elements
                var $ = cheerio.load(html);

                //collect the text of every author link
                $('.quoteText > a').each(function(){
                    json.author.push($(this).text());
                });
                //collect the raw text of every quote block
                $('.quoteText').each(function(){
                    json.quote.push($(this).text());
                });
            }
            pageDone();
        });
    });
});

//Trim every scraped entry in the module-level json.quote array down to the
//quote text, then save the whole json object to output.json.
//Takes no arguments and returns nothing; json.quote is rewritten in place.
function cleanUp(){
    //loop through json object to clean up strings
    for(var i = 0; i < json.quote.length; i++) {
        var text = json.quote[i];
        //find the index of where the quote ends (the "―" character)
        var endQuote = text.indexOf("―");
        //keep only the part before the dash, also dropping the single character
        //right in front of it. When there is no dash the text is kept as is:
        //substring() with a negative end would turn it into an empty string.
        if(endQuote !== -1){
            text = text.substring(0, endQuote - 1);
        }
        //remove line breaks (\r\n, \n, \r) from string
        json.quote[i] = text.replace(/(\r\n|\n|\r)/gm,"");
    }
    //write the json file to folder
    fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
        if(err){
            //do not claim success when the write failed
            console.log('Could not write output.json: ' + err);
            return;
        }
        console.log('File successfully written! - Check your project directory for the output.json file');
    });
}


//Start listening for HTTP requests on port 8081 (the port is passed as a string)
app.listen('8081')
//Logged immediately after listen() is called
console.log('Magic happens on port 8081');
//Expose the Express app as this module's export
exports = module.exports = app;

1 个答案:

答案 0 :(得分:1)

在您提供的示例代码中:

//Collect the address of every quotes page (1 to 100) in an array
var url = [];
var i = 1;
while(i <= 100){
    url[url.length] = 'http://www.goodreads.com/quotes?page=' + i;
    i += 1;
}

上面的写法修正了您原来代码中的问题:原来的for循环每次迭代都会覆盖url变量,因此循环结束后只剩下最后一页的地址。

您可以通过对代码进行一些小的更改来使其工作;最简单的方法是把url改为一个数组,然后在每次循环中把地址push进数组,这样url列表就会像上面的代码那样不断累积。

然后,您需要为数组中的每一项调用request函数,因为url现在是一个包含100个元素的数组;同时还要把fs.writeFile调用改为fs.appendFile,这样每次request调用的结果都会追加到output.json文件中,而不是覆盖它。

最后,您还应该考虑对请求进行限速(节流),以免给您正在抓取的网站的服务器造成过大压力。