用于检索已登录用户的标题和描述的网络爬虫无法正常工作

时间:2013-07-27 07:59:57

标签: javascript node.js web web-crawler

我想使用 Node.js 创建一个网络爬虫,但只想抓取用户将要访问的那些网址。用户首先会以 id“createdby”登录。另外,我的数据库中还有“URL”和“队列”(queue)两张表。

我在数据访问层“dal / url.js”中编写此代码,但它无法正常工作

DAL / url.js:

exports.selectAll = function (req, res) {
    db.db_connect(function (err, con) {
        if (err) {
            throw err;
        } else {
            var select = "select * from URL where CreatedBy = '" + req.params.id + "'";
            con.query(select, function (err, msg) {
                if (err) {
                    throw err;
                } else {
                    console.log("Select All from URL");
                    msg.fetchAll(function (err, rows) {
                        if (err) {
                            throw err;
                        } else {
                            res.json(rows);
                        }
                        Crawler = function () {
                            var self = this;
                            this.conn = db.createConnection(config.get('db'));
                            this.indexed = 0;
                            this._url = select;
                            console.log(select);
                            this.url = select;
                            this.crawl = function (cb) {
                                this.conn.query('SELECT * FROM `queue` LIMIT 0,1', function (e, result) {
                                    self.url = result.length > 0 ? result[0].url : select;
                                    request(self.url, function (e, res, body) {
                                        if (result.length > 0) {
                                            self.conn.query('DELETE FROM `queue` WHERE `id` = ?', [result[0].id], function () {
                                                cb();
                                            });
                                        } else {
                                            cb();
                                        }
                                        if (!e && res.statusCode === 200) {
                                            self.getInfo(body, result.length > 0 ? result[0].from : '');
                                        } else {
                                            console.log('Error requesting page %s', self.url);
                                        }
                                        self._url = self.url;
                                    });
                                });
                            };
                            this.getInfo = function (html, from) {
                                var $ = cheerio.load(html);
                                var title = $('head title').text();
                                var keywords = $('head meta[name=keywords]').attr('content');
                                var desc = $('head meta[name=description]').attr('content');
                                var links = $('a');
                                console.log('Crawling "%s" | %s', title, this.url);
                                async.map(links.map(function () {
                                    var href = $(this).attr('href');
                                    if (href && href != self._url && !(/^#(\w)+/.test(href)) && !cutil.imageRegexp.test(href)) {
                                        if (cutil.isExternal(href)) {
                                            return 'INSERT INTO `queue` SET `id` = \'' + cutil.id() + '\', `url` = ' + self.conn.escape(href) + ', `from` = ' + self.conn.escape(from);
                                        } else {
                                            return 'INSERT INTO `queue` SET `id` = \'' + cutil.id() + '\', `url` = ' + self.conn.escape(cutil.resolveRelativeURL(href, self._url)) + ', `from` = ' + self.conn.escape(from);
                                        }
                                    }
                                    return false;
                                }).filter(function (el) {
                                    return !!el;
                                }), this.conn.query.bind(this.conn), function (e, result) {
                                    if (e) {
                                        console.log('Error writing queue.');
                                        console.log(e);
                                    }
                                });
                                this.conn.query('INSERT INTO `URL` SET ?', {
                                    id: cutil.id(),
                                    url: this.url,
                                    from: from,
                                    title: title,
                                    keywords: keywords || '',
                                    desc: desc || ''
                                }, function (e) {
                                    if (e) {
                                        console.log('Error indexing page %s', self.url);
                                        console.log(e);
                                    } else {
                                        console.log('Successfully indexed page %s', self.url);
                                        self.indexed++;
                                    }
                                });
                            };
                        };
                    });
                }
            });
        }
    });

我使用的是一个简单的网络爬虫,参考了这个链接:“http://licson.net/post/create-a-simple-web-spider-in-node-js/”

0 个答案:

没有答案