论坛标题网页抓取工具

时间:2016-08-27 15:34:33

标签: javascript jquery node.js web-scraping cheerio

我正在编写一个简单的网络抓取工具,可以从论坛中提取帖子标题,用户名和上次发布时间。

问题是抓取工具只提取了表格中的最后一个条目。

例如: 如果表格的结构是这样的:

<tbody>
<tr class="">
  <td class="title">    
    <a href="/forums/marketplace/8827" title="View full post details">Title number 1</a>
  </td>
  <td class="author"><a href="/members/pursu" title="View member, pursu">pursu</a></td>
  <td class="count">0</td>
  <td class="last_post">9 minutes ago</td>
</tr>
<tr class="color2">
  <td class="title">

    <a href="/forums/marketplace/8826" title="View full post details">Title number 2</a>
  </td>
  <td class="author"><a href="/members/colinatx" title="View member, colinatx">colinatx</a></td>
  <td class="count">0</td>
  <td class="last_post">9 minutes ago</td>
</tr>
<tr class="">
  <td class="title">    
    <a href="/forums/marketplace/8785" title="View full post details">Title number 3</a>
  </td>
  <td class="author"><a href="/members/Object117" title="View member, Object117">Object117</a></td>
  <td class="count">11</td>
  <td class="last_post">about 1 hour ago</td>
</tr>
</tbody>

实际写入 output.json 输出文件的结果是:

{
    "title": "Title number 3",
    "author": "Object117",
    "lastpost": "about 1 hour ago"
}

而它本应是这样的:

{
    "title": "Title number 1",
    "author": "pursu",
    "lastpost": "9 minutes ago"
}
{
    "title": "Title number 2",
    "author": "colinatx",
    "lastpost": "9 minutes ago"
}
{
    "title": "Title number 3",
    "author": "Object117",
    "lastpost": "about 1 hour ago"
}

我的JavaScript:

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();

// Route handler for GET /scrape: downloads the forum listing page, extracts
// title / author / last-post text with cheerio, writes the result to
// output.json and replies to the browser with a short reminder message.
// NOTE: this is the listing that exhibits the reported problem — only the last
// table row ends up in output.json (see the comments on the .filter() calls).
app.get('/scrape', function(req, res){

    // This is the URL to pull data from.
    // NOTE: `url` is assigned without var/let/const, so it becomes an implicit global.
    url = 'http://www.pedalroom.com/forums/marketplace';

    // The first parameter is our URL

    // The callback function takes 3 parameters, an error, response status code and the html
     request(url, function(error, response, html){
          if(!error){

              // Parse the downloaded HTML so it can be queried with jQuery-style selectors
            var $ = cheerio.load(html);

              // Variables that capture data.
              // A single `json` object is shared by every row, so it can hold one record only.
            var title, author, lastpost;
            var json = { title : "", author : "", lastpost : ""};

            // .filter() invokes the callback once per matched element; each call
            // overwrites json.title, so only the last '.title' cell survives.
            $('.title').filter(function(){

                var data = $(this);

                // Text of the first child of the cell (the <a> link in the sample markup)
                title = data.children().first().text();

                json.title = title;
            })
            // Same overwrite pattern for the author cells
             $('.author').filter(function(){

                var data = $(this);

                author = data.children().first().text();

                json.author = author;
            })
            // Same overwrite pattern for the last-post cells (plain text, no child link)
             $('.last_post').filter(function(){

                var data = $(this);

                lastpost = data.text();

                json.lastpost = lastpost;
            })
     }
         // Runs even when the request failed; in that case `json` was never assigned
         // (var is hoisted to the function scope, so it is undefined rather than missing).
         // NOTE: `err` is not inspected, so the success message is logged unconditionally.
         fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){

             console.log('File successfully written! - Check your project directory for the output.json file');

         })

         // Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
         // This is sent right away, without waiting for the writeFile callback.
         res.send('Check your console!')

     });
})

// Start the HTTP server; the port is passed as the string '8081'.
app.listen('8081')
// Logged immediately after the listen() call is issued.
console.log('Magic happens on port 8081');
// Expose the Express app as this module's export.
exports = module.exports = app;

我是否需要以某种方式对代码进行循环,或者做其他处理?

1 个答案:

答案 0 :(得分:3)

在你的代码中,所有匹配到的单元格都在向同一个 json 对象赋值,后面的值会覆盖前面的值,所以最终只保留了最后一行的数据。你需要逐行循环,并为每一行生成一个独立的对象。

以下是可以正常工作的代码:

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();

/**
 * GET /scrape
 *
 * Downloads the forum listing page, collects one record per table row
 * ({ title, author, lastpost }) and writes the records as a JSON array to
 * output.json. The browser receives a short status message once the file has
 * been written, or a 500 response if the download or the write fails.
 *
 * @param {object} req - Express request (unused).
 * @param {object} res - Express response used to report the outcome.
 */
app.get('/scrape', function(req, res){

    // This is the URL to pull data from (declared locally so it does not leak as a global)
    var url = 'http://www.pedalroom.com/forums/marketplace';

    // The callback function takes 3 parameters, an error, response status code and the html
    request(url, function(error, response, html){

        // Bail out early: without HTML there is nothing to parse or write
        if(error){
            console.error('Request to ' + url + ' failed:', error);
            res.status(500).send('Could not fetch the forum page.');
            return;
        }

        //pulling HTML
        var $ = cheerio.load(html);

        // One entry per topic row
        var data = [];

        // For each row of the table
        $('.topics tr').each(function(index, element){

            var row = $(element);
            var titleLink = row.find('.title a');

            // Rows without a title link carry no topic, skip them
            if(titleLink.length === 0){
                return;
            }

            // .text() yields decoded plain text, whereas .html() would return
            // entity-encoded markup (e.g. "&amp;" instead of "&")
            data.push({
                title: titleLink.first().text().trim(),
                author: row.find('.author a').first().text().trim(),
                lastpost: row.find('.last_post').first().text().trim()
            });
        });

        fs.writeFile('output.json', JSON.stringify(data, null, 4), function(err){

            // Report a failed write instead of claiming success
            if(err){
                console.error('Could not write output.json:', err);
                res.status(500).send('Could not write output.json.');
                return;
            }

            console.log('File successfully written! - Check your project directory for the output.json file');

            // Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
            // Sent only after the file is on disk so the message is truthful.
            res.send('Check your console!');
        });
    });
});

// Bring the server up on port 8081 and announce it on the console.
app.listen('8081');
console.log('Magic happens on port 8081');

// Make the Express app available to anything that requires this module,
// keeping the local `exports` alias pointing at the same object.
module.exports = app;
exports = module.exports;