网络抓取工具不会检索标题和说明

时间:2013-07-27 08:38:21

标签: node.js web web-crawler

我想在 Node.js 中创建一个简单的网络爬虫,它会从已登录成员访问过的 URL 中提取标题(title)和描述(description)。成员想要抓取时必须先登录,属于该成员的 URL 通过 “CreatedBy” 列中的用户 ID 来确定。这是我的代码,DAL/url.js:

   var mysql = require('mysql-libmysqlclient');
      var cheerio = require('cheerio');
      var async = require('async');
      var db = require('mysql');
      var generator = require('../bal/urlgenerator');
      var crawler = require('../bal/url');

      var cutil = require('../util/crawler_utils');              
      var request = require('request');
      var fs = require("fs");

      var path = require("path");
      var domain = "http://www.pin.tl/" ;


      // Load ../config.json (resolved relative to this module) once, at module load.
      // fs.readFileSync itself throws when the file is missing or unreadable, so the
      // explicit check below only has to catch an empty file. A Buffer is always
      // truthy, hence the length test.
      var configs = fs.readFileSync(path.resolve(__dirname, "../config.json"));
      if (configs && configs.length > 0) {
        // JSON.parse throws a SyntaxError on malformed content.
        configs = JSON.parse(configs.toString());
      }
      else {
        // Throw a real Error; the previous `throw error` referenced an undefined
        // identifier and would have surfaced as a ReferenceError.
        throw new Error("config.json is empty: " + path.resolve(__dirname, "../config.json"));
      }

       exports.insert = function (req,res) {

       var surl = generator.generateURL().toString();

       db.db_connect(function (err, con) {
       if (err) {
        throw err;
        }

       else {

       var custom = req.body.curl;
       if(!custom=="")
       {
       var insertQry = "INSERT INTO URL(idURL, URL, Name, ShortURL, CreatedBy, CreatedDate, Code, idCategoryURL, IP) values(" +
                "uuid(),'" + req.body.url + "', '' , '"+ domain+custom +"', '"+req.params.userid+"', Now(), '"+custom+"', '15e90e46-ed13-11e2-8bca-74867a028220', '"+req.connection.remoteAddress+"')";
         console.log(insertQry)
         con.query(insertQry, function (err, msg) {
         if (err) {

          throw err;
          } else {
          console.log("Data successfully inserted");

          }

          });
           }

          else {
        var insertQry = "INSERT INTO URL(idURL, URL, Name, ShortURL, CreatedBy, CreatedDate, Code, idCategoryURL, IP) values(" +
                "uuid(),'" + req.body.url + "', '' , '"+ domain+surl +"', '"+req.params.userid+"', Now(), '"+surl+"', '15e90e46-ed13-11e2-8bca-74867a028220', '"+req.connection.remoteAddress+"')";
         console.log(insertQry)
        con.query(insertQry, function (err, msg) {
        if (err) {
        throw  err;
        } else {
        console.log("Data successfully inserted");
          res.send(domain+surl);
         res.send(req.body.FullName);
          }

          });
          }

          }
         });
        }

      exports.selectAll = function (req, res) {
      db.db_connect(function (err, con) {
    if (err) {
        throw err;
    }

    else {

      var select ="select * from URL where CreatedBy = '"+req.params.id+"'";
      con.query(select, function (err, msg) {
      if (err) {
     throw  err;
        }
            else {
       console.log("Select All from URL");
       msg.fetchAll(function(err, rows){

        if(err){
        throw err;
        }

         else{
        res.json(rows);

       }
       Crawler = function(){
       var self = this;
       this.conn = db.createConnection(config.get('db'));
       this.indexed = 0;
        this._url = select;
       console.log(select);
       this.url = select;
       this.crawl = function(cb){
        this.conn.query('SELECT * FROM `queue` LIMIT 0,1',function(e,result){
       self.url = result.length > 0 ? result[0].url : select;
       request(self.url,function(e,res,body){
        if(result.length > 0){
        self.conn.query('DELETE FROM `queue` WHERE `id` = ?',[result[0].id],function(){
        cb();
        });
         }
        else {
        cb();
        }
        if(!e && res.statusCode === 200){
        self.getInfo(body,result.length > 0 ? result[0].from : '');
       }
       else {
       console.log('Error requesting page %s',self.url);
       }
       self._url = self.url;
        });
       });
       };
      this.getInfo = function(html,from){
       var $ = cheerio.load(html);
       var title = $('head title').text();
       var keywords = $('head meta[name=keywords]').attr('content');
       var desc = $('head meta[name=description]').attr('content');
      var links = $('a');
       console.log('Crawling "%s" | %s',title,this.url);
        async.map(links.map(function(){
        var href = $(this).attr('href');
        if(href && href != self._url && !(/^#(\w)+/.test(href)) && !cutil.imageRegexp.test(href)){
        if(cutil.isExternal(href)){
                                        return 'INSERT INTO `queue` SET `id` = ``\''+cutil.id()+'\', `url` = '+self.conn.escape(href)+', `from` = '+self.conn.escape(from);
          }
         else {
         return 'INSERT INTO `queue` SET `id` = \''+cutil.id()+'\', `url` = '+self.conn.escape(cutil.resolveRelativeURL(href,self._url))+', `from` = '+self.conn.escape(from);
       }
        }
        return false;
         }).filter(function(el){
        return !!el;
        })
         ,this.conn.query.bind(this.conn),function(e,result){
        if(e){
        console.log('Error writing queue.');
        console.log(e);
         }
          });
         this.conn.query('INSERT INTO `URL` SET ?',{
        id:cutil.id(),
        url:this.url,
        from:from,
        title:title,
        keywords:keywords || '',
       desc:desc || ''
       },function(e){
       if(e){
       console.log('Error indexing page %s',self.url);
        console.log(e);
        }
        else {
       console.log('Successfully indexed page %s',self.url);
       self.indexed++;
       }
       });
       };
       };
       });

      }

      });

       }


       });

       }

0 个答案:

没有答案