以下代码是在 soupselect 的演示示例(demo example)基础上修改而来的。它的作用大致是:获取一段 HTML,打印出其中的链接列表,并把这些链接保存到一个变量中:
/**
 * Requests "/" from `host` on port 80, parses the response body as HTML and
 * prints every element matching `a.title`, collecting each href in a local
 * array.
 *
 * Nothing is returned: the array is only filled inside the response "end"
 * handler, which runs after this function has already returned.
 *
 * @param {string} host - Host name to connect to; also sent as the `host` header.
 */
var crawl = function(host) {
  var select = require('soupselect').select,
    htmlparser = require("htmlparser"),
    http = require('http'),
    sys = require('sys');

  // fetch some HTML...
  // NOTE(review): http.createClient and the `sys` module are legacy Node APIs —
  // confirm they exist in the targeted Node version.
  var client = http.createClient(80, host);
  var request = client.request('GET', '/', {'host': host});
  var newPages = [];

  request.on('response', function (response) {
    response.setEncoding('utf8');

    // Accumulate the chunks until the whole body has arrived.
    var body = "";
    response.on('data', function (chunk) {
      body = body + chunk;
    });

    response.on('end', function() {
      // now we have the whole body, parse it and select the nodes we want...
      var handler = new htmlparser.DefaultHandler(function(err, dom) {
        if (err) {
          sys.debug("Error: " + err);
        } else {
          // soupselect happening here...
          var titles = select(dom, 'a.title');
          sys.puts("Top stories from reddit");
          titles.forEach(function(title) {
            sys.puts("- " + title.children[0].raw + " [" + title.attribs.href + "]\n");
            newPages.push(title.attribs.href);
          });
        }
      });
      var parser = new htmlparser.Parser(handler);
      parser.parseComplete(body);
    });
  });

  request.end();
};
我真正想要的是让这个函数返回 newPages,也就是说,我希望可以写成 newPages = crawl(host);。麻烦在于,我不确定这样做是否合理,也不知道应该把 return 语句放在哪里。我发现 newPages 在请求结束之前存在,但在请求结束之后却是空的。
如何让该函数的返回值为 newPages?
答案 0 :(得分:1)
将您的函数签名改为:
crawl = function(host, done)
并将您的函数体更新为:
titles.forEach(function(title) {
  sys.puts("- " + title.children[0].raw + " [" + title.attribs.href + "]\n");
  newPages.push(title.attribs.href);
});
// Hand the result to the caller exactly once, after every link has been
// collected (and also when no link matched), rather than once per link.
done(newPages);
然后您就可以像这样调用 crawl:
// Callback handed to crawl as its second argument; `pages` is whatever crawl
// passes to that callback.
var processNewPages = function(pages) {
  // do something with pages here
};
crawl(host, processNewPages);
答案 1 :(得分:1)
我喜欢使用 request、cheerio 和 async 模块来抓取网站。这段代码更短,我觉得也更具可读性。
var request = require('request');
var cheerio = require('cheerio');
var async = require('async');
/**
 * Crawls pages starting at `url`, at most 5 at a time, visiting each URL once.
 *
 * @param {string} url - First page to fetch.
 * @param {Function} contentSelector - Called with the loaded page (`$`);
 *   returns the items to add to the results (array or array-like).
 * @param {Function} linkSelector - Called with the loaded page (`$`);
 *   returns the URLs to crawl next (array or array-like).
 * @param {Function} callback - Called with the array of all collected results
 *   once the queue has drained.
 */
function crawl(url, contentSelector, linkSelector, callback) {
  var results = [];
  // Prototype-less map so a URL can never collide with an inherited key.
  var visited = Object.create(null);
  var queue = async.queue(crawlPage, 5); // crawl 5 pages at a time

  // Pass the accumulated results on; a bare `callback` would be invoked
  // without any arguments.
  // NOTE(review): property-style drain handler kept from the original; newer
  // async releases may expect `queue.drain(fn)` instead — confirm the version.
  queue.drain = function() {
    callback(results);
  };

  // Selectors may return an array-like wrapper instead of a real array;
  // normalise so concat() and queue.push() see individual items.
  function toArray(selection) {
    if (selection == null) return [];
    return Array.isArray(selection) ? selection : Array.from(selection);
  }

  function crawlPage(pageUrl, done) {
    // make sure to visit each page only once
    if (visited[pageUrl]) return done();
    visited[pageUrl] = true;

    request(pageUrl, function(err, response, body) {
      // Pages that fail to load are skipped; the crawl carries on.
      if (!err) {
        var $ = cheerio.load(body); // "jQuery"
        results = results.concat(toArray(contentSelector($))); // add something to the results
        var links = toArray(linkSelector($));
        if (links.length > 0) {
          queue.push(links); // add links found on this page to the queue
        }
      }
      done();
    });
  }

  // Seed the queue; without this no page is ever requested.
  queue.push(url);
}
/**
 * Collects the text of every `a.title` element on a loaded page.
 *
 * @param {Function} $ - The loaded page, as produced by `cheerio.load`.
 * @returns {string[]} Plain array with one title per matching element.
 */
function getStoryTitles($) {
  // .map() yields a cheerio wrapper; .get() unwraps it into a plain array.
  return $('a.title').map(function() { return $(this).text(); }).get();
}
/**
 * Collects the `href` attribute of every `a.title` element on a loaded page.
 *
 * @param {Function} $ - The loaded page, as produced by `cheerio.load`.
 * @returns {string[]} Plain array with one href per matching element.
 */
function getStoryLinks($) {
  // .map() yields a cheerio wrapper; .get() unwraps it into a plain array.
  return $('a.title').map(function() { return $(this).attr('href'); }).get();
}
// Start the crawl at reddit's front page and print whatever crawl hands to
// the completion callback.
var startUrl = 'http://www.reddit.com';
crawl(startUrl, getStoryTitles, getStoryLinks, function onCrawled(stories) {
  console.log(stories); // all stories!
});
最后,您会得到一个包含所有故事的数组,这大概就是您想要的结果,只是写法不同而已。您也可以按照 AndyD 的建议修改您自己的函数,使其具有类似的行为。
将来,您将可以使用生成器(generator),它能让您不借助回调函数就拿到这些故事,这更接近您想要的写法。有关详细信息,请参阅这篇文章(this article)。
// Sketch of a generator-based crawler: each `yield` hands one value to the
// code that drives the generator.
// NOTE(review): `story` is a placeholder — the logic that would produce it is
// elided ("do stuff"), so this snippet is illustrative only.
function* crawl(url) {
// do stuff
yield story;
}
// Calling a generator function returns an iterator. Each next() call returns
// an object of the form { value, done }, so the yielded story is its `value`.
var crawler = crawl('http://www.reddit.com');
var firstStory = crawler.next().value;
var secondStory = crawler.next().value;
// ...