在某些父页面中,我需要一些子页面锚点。我想抓取所有父页面,解析它们,然后获取子锚点,跟随锚点,并获得结果。但是当我编写代码时我发现,在我关注锚点之前,锚点网址没有改变。这是我的代码:
var req = require('request');
var cheerio = require('cheerio')
var model = require('./model')
function callnext(index){
var url = 'http://www.youku.com/show_episode/id_z2c9b63e691e611e2b356.html?dt=json&divid=reload_'+index+'&__rt=1&__ro=reload_21';
var result = req.get(url, function(error, response, body){
if (!error && response.statusCode == 200) {
var patt = /暂无内容/g;
var result = patt.test(body);
if(result){
return;
}
$ = cheerio.load(body);
var children = $('div').first().children();
for(var i=0;i<children.length;i++){
var item = $(children[i]);
var anchor = $(item.find('li>a')[0]).attr('href');
var labelText = $(item.find('label')[0]).text();
//TAG 1
req.get(anchor, function(error, response, body){
//TAG 2
console.log(anchor);
//here's my result
})
}
index = index+20;
callnext(index)
}
})
}
callnext(1);
在这段代码中,如果我在TAG1位置和TAG2位置的console.log()锚点网址,它会产生不同的结果。 在TAG 1中,它是我的预期结果,但在TAG 2中,似乎只打印出父页面的第一个锚点。
我试图改变代码并提取子请求函数,结果是正确的结果。为什么?
var req = require('request');
var cheerio = require('cheerio')
var model = require('./model')
function crawlItem(url, text){
req.get(url, function(error, response, body){
console.log(url)
var inner = cheerio.load(body);
var text = inner('#text_long').text();
// model.Talk.create({ id: la, video: hr, youku_desc:text }).complete(function(err, album) {
// console.log(err);
// });
})
}
function callnext(index){
var url = 'http://www.youku.com/show_episode/id_z2c9b63e691e611e2b356.html?dt=json&divid=reload_'+index+'&__rt=1&__ro=reload_21';
var result = req.get(url, function(error, response, body){
if (!error && response.statusCode == 200) {
var patt = /暂无内容/g;
var result = patt.test(body);
if(result){
return;
}
$ = cheerio.load(body);
var children = $('div').first().children();
for(var i=0;i<children.length;i++){
var item = $(children[i]);
var anchor = $(item.find('li>a')[0]).attr('href');
var labelText = $(item.find('label')[0]).text();
// console.log(anchor);
crawlItem(anchor, labelText);
}
index = index+20;
callnext(index)
}
})
}
callnext(1);