我正在使用Node.js,Express.js和Cheerio进行网页抓取,我正在尝试从这段html中获取链接列表:
<div id="tavoleScript">
<div style="display: block;padding-bottom:0; border:0;" id="box1">
<ul style="padding-bottom:10px;" class="ul_chiuso">
<li><a href="javascript:mostra(1)" tabindex="50">Anno 2016 - Coperture per singolo antigene</a></li>
</ul>
<noscript>
<ul style="padding-bottom:5px;" class="ul_09">
<li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_0_fileAllegati_itemFile_5_file.pdf" tabindex="50">Anno 2016 - Coperture a 24 mesi</a> (pdf, 279.2 Kb )
</li>
<li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_1_fileAllegati_itemFile_5_file.pdf" tabindex="50">Anno 2016 - Coperture a 36 mesi</a> (pdf, 279.2 Kb )
</li>
<li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_2_fileAllegati_itemFile_5_file.pdf" tabindex="50">Anno 2016 - Coperture a 5-6 anni</a> (pdf, 234.8 Kb )
</li>
<li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_3_fileAllegati_itemFile_5_file.pdf" tabindex="50">Anno 2016 - Coperture a 16 anni</a> (pdf, 256.9 Kb )
</li>
<li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_4_fileAllegati_itemFile_5_file.pdf" tabindex="50">Anno 2016 - Coperture a 18 anni</a> (pdf, 256.9 Kb )
</li>
<li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_5_fileAllegati_itemFile_5_file.pdf" tabindex="50">Approfondimento: le coperture vaccinali dell'età pediatrica e dell'adolescente</a> (pdf, 218.9 Kb )
</li>
</ul>
</noscript>
</div>
</div>
这是我的代码:
// Accumulates the scraped hrefs; filled in by the request callback below.
var links = [];
// Fetch the page at `url`; the callback gets an error (if any), the response
// object and the page HTML.
request(url, function(err, resp, html) {
if(err) {
throw err;
}
// Parse the downloaded HTML so it can be queried with CSS selectors.
var $ = cheerio.load(html);
// For every <a> inside an <li> below an element with class "ul_09",
// read its href attribute and store it as { link: href }.
$('.ul_09 li a').each(function() {
var link = $(this).attr('href');
links.push({"link": link});
});
});
// NOTE(review): this statement runs right after request() has been called.
// The callback above presumably runs later, once the response has arrived,
// so `links` would still be empty at this point — confirm.
console.log(links);
所以我感兴趣的是每个 a 元素的 href 属性。
当我打印 links 对象时,得到的是一个空数组 []。
我该如何解决?
谢谢!
我试过了:
// Second attempt: the same scrape wrapped in an async function.
// The `req` and `res` parameters are not used in this body; the callback's
// own `res` parameter shadows the outer one.
async function pdfUrls(req, res) {
request(url, function(err, res, html) {
if(err) {
throw err;
}
// Parse the downloaded HTML so it can be queried with CSS selectors.
var $ = cheerio.load(html);
/*let listOfLinks = $('noscript > ul > li > a').map(function(i, el) {
return $(this).attr('href');
}).toArray();
links = listOfLinks;
console.log('1', links);*/
// scraping: collect the href of every link matched by the selector
$('.ul_09 li a').each(function() {
// `await` appears here inside a plain (non-async) callback function; only the
// outer pdfUrls is declared async. This is the line the reported SyntaxError
// points at. The other snippets in this file call attr('href') without await.
let link = await $(this).attr('href');
links.push({"link": link});
});
// Logged from inside the request callback, after the loop above.
console.log('1', links);
}); // end request
// Logged right after request() has been called, outside its callback.
console.log('2', links);
}
但是我收到了这个错误:
let link = await $(this).attr('href'); ^ SyntaxError: Unexpected identifier at createScript (vm.js:80:10) at Object.runInThisContext (vm.js:139:10) at Module._compile (module.js:607:28) at Object.Module._extensions..js (module.js:654:10) at Module.load (module.js:556:32) at tryModuleLoad (module.js:499:12) at Function.Module._load (module.js:491:3) at Module.require (module.js:587:17) at require (internal/module.js:11:18) at Object.<anonymous> (C:\...\app.js:5:29)
这是我的代码:
// main page of gov coverage (the page that is requested and scraped by pdfUrls)
var mainUrl = 'http://www.salute.gov.it/portale/documentazione/p6_2_8_3_1.jsp?id=20';
// array of pdf links; shared module-level state that pdfUrls reassigns
// from inside its request callback
var links = [];
// Handler that starts the scrape and then logs the shared `links` array.
// `req` and `res` are not used in this body and are not forwarded to pdfUrls.
methods.download = function(req, res) {
pdfUrls();
// In the reported output 'links 3' is printed before 'links 1', i.e. this log
// runs before the request callback inside pdfUrls has assigned `links`.
console.log('links 3', links);
};
// Requests mainUrl and, in the callback, stores the scraped hrefs in the
// module-level `links` variable. Returns nothing; `req` and `res` are unused
// here, and the callback's `res` parameter shadows the outer one.
function pdfUrls(req, res) {
request(mainUrl, function(err, res, html) {
if(err) {
throw err;
}
// Parse the downloaded HTML so it can be queried with CSS selectors.
var $ = cheerio.load(html);
// scraping first method: hrefs of <a> elements that are direct descendants
// along the path noscript > ul > li
let listOfLinks = $('noscript > ul > li > a').map(function(i, el) {
return $(this).attr('href');
}).toArray();
links = listOfLinks;
// In the reported output this line prints last and still shows [], so the
// selector matched nothing in the parsed document.
// NOTE(review): possibly the HTML parser keeps <noscript> content as plain
// text instead of elements — confirm against the installed cheerio version.
console.log('links 1', links);
// scraping second method
/*$('.ul_09 li a').each(function() {
let link = $(this).attr('href');
links.push({"link": link});
});
console.log('links 1', links);*/
});
// In the reported output this line prints first, before the callback's
// 'links 1', so `links` is still the initial empty array here.
console.log('links 2', links);
}
我分别用被注释掉的写法和未注释的写法运行了这段代码,结果总是这样:
links 2 []
links 3 []
links 1 []
我不明白究竟是什么问题。
答案 0 :(得分:0)
您可以尝试这样写:
// Fetch the page and collect the href of every PDF link listed in the
// <noscript> fallback list, storing the result in the outer `links` variable.
request(url, function(err, resp, html) {
  if (err) {
    throw err;
  }
  // The wanted <ul> lives inside <noscript>. With scripting enabled (the HTML
  // parser's default), <noscript> content is kept as raw text, so no selector
  // can reach the <a> elements and the result is []. Disabling scripting makes
  // the parser build real elements for that content.
  // NOTE: this needs a cheerio release that supports `scriptingEnabled`;
  // check the documentation of the installed version.
  const $ = cheerio.load(html, { scriptingEnabled: false });
  links = $('noscript > ul > li > a').map(function(i, el) {
    return $(el).attr('href');
  }).toArray();
  // Use `links` here, inside the callback (or hand it to a callback/promise
  // from here): statements placed after request(...) run before the response
  // has arrived and would still see the old, empty value.
  console.log(links);
});