我想使用 Node.js 创建一个网络爬虫,但只抓取用户将要访问的那些网址。首先,用户会使用自己的 id(对应 “CreatedBy” 字段)登录。我的数据库中还有 “URL” 和 “queue” 两张表。
我在数据访问层 “dal/url.js” 中编写了下面这段代码,但它无法正常工作。
dal/url.js:
exports.selectAll = function (req, res) {
db.db_connect(function (err, con) {
if (err) {
throw err;
} else {
var select = "select * from URL where CreatedBy = '" + req.params.id + "'";
con.query(select, function (err, msg) {
if (err) {
throw err;
} else {
console.log("Select All from URL");
msg.fetchAll(function (err, rows) {
if (err) {
throw err;
} else {
res.json(rows);
}
Crawler = function () {
var self = this;
this.conn = db.createConnection(config.get('db'));
this.indexed = 0;
this._url = select;
console.log(select);
this.url = select;
this.crawl = function (cb) {
this.conn.query('SELECT * FROM `queue` LIMIT 0,1', function (e, result) {
self.url = result.length > 0 ? result[0].url : select;
request(self.url, function (e, res, body) {
if (result.length > 0) {
self.conn.query('DELETE FROM `queue` WHERE `id` = ?', [result[0].id], function () {
cb();
});
} else {
cb();
}
if (!e && res.statusCode === 200) {
self.getInfo(body, result.length > 0 ? result[0].from : '');
} else {
console.log('Error requesting page %s', self.url);
}
self._url = self.url;
});
});
};
this.getInfo = function (html, from) {
var $ = cheerio.load(html);
var title = $('head title').text();
var keywords = $('head meta[name=keywords]').attr('content');
var desc = $('head meta[name=description]').attr('content');
var links = $('a');
console.log('Crawling "%s" | %s', title, this.url);
async.map(links.map(function () {
var href = $(this).attr('href');
if (href && href != self._url && !(/^#(\w)+/.test(href)) && !cutil.imageRegexp.test(href)) {
if (cutil.isExternal(href)) {
return 'INSERT INTO `queue` SET `id` = \'' + cutil.id() + '\', `url` = ' + self.conn.escape(href) + ', `from` = ' + self.conn.escape(from);
} else {
return 'INSERT INTO `queue` SET `id` = \'' + cutil.id() + '\', `url` = ' + self.conn.escape(cutil.resolveRelativeURL(href, self._url)) + ', `from` = ' + self.conn.escape(from);
}
}
return false;
}).filter(function (el) {
return !!el;
}), this.conn.query.bind(this.conn), function (e, result) {
if (e) {
console.log('Error writing queue.');
console.log(e);
}
});
this.conn.query('INSERT INTO `URL` SET ?', {
id: cutil.id(),
url: this.url,
from: from,
title: title,
keywords: keywords || '',
desc: desc || ''
}, function (e) {
if (e) {
console.log('Error indexing page %s', self.url);
console.log(e);
} else {
console.log('Successfully indexed page %s', self.url);
self.indexed++;
}
});
};
};
});
}
});
}
});
我使用的是一个简单的网络爬虫,参考了这篇教程:http://licson.net/post/create-a-simple-web-spider-in-node-js/