使用Cheerio从html获取链接

时间:2018-03-08 11:10:35

标签: node.js web-scraping cheerio

我正在使用Node.js,Express.js和Cheerio进行网页抓取,我正在尝试从这段html中获取链接列表:

<!-- Sample markup being scraped: the PDF links are the <a> elements inside
     ul.ul_09, which is nested in a <noscript> element; the only anchor outside
     <noscript> is the javascript:mostra(1) toggle in ul.ul_chiuso. -->
<div id="tavoleScript">
    <div style="display: block;padding-bottom:0; border:0;" id="box1">
        <ul style="padding-bottom:10px;" class="ul_chiuso">
            <li><a href="javascript:mostra(1)" tabindex="50">Anno 2016 - Coperture per singolo antigene</a></li>
        </ul>
        <noscript>
            <ul style="padding-bottom:5px;" class="ul_09">
                <li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_0_fileAllegati_itemFile_5_file.pdf" tabindex="50">Anno 2016 - Coperture a 24 mesi</a> (pdf, 279.2 Kb )
                </li>
                <li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_1_fileAllegati_itemFile_5_file.pdf" tabindex="50">Anno 2016 - Coperture a 36 mesi</a> (pdf, 279.2 Kb )
                </li>
                <li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_2_fileAllegati_itemFile_5_file.pdf" tabindex="50">Anno 2016 - Coperture a 5-6 anni</a> (pdf, 234.8 Kb )
                </li>
                <li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_3_fileAllegati_itemFile_5_file.pdf" tabindex="50">Anno 2016 - Coperture a 16 anni</a> (pdf, 256.9 Kb )
                </li>
                <li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_4_fileAllegati_itemFile_5_file.pdf" tabindex="50">Anno 2016 - Coperture a 18 anni</a> (pdf, 256.9 Kb )
                </li>
                <li style="padding-bottom:5px;"><a href="/imgs/C_17_tavole_20_allegati_iitemAllegati_5_fileAllegati_itemFile_5_file.pdf" tabindex="50">Approfondimento: le coperture vaccinali dell'età pediatrica e dell'adolescente</a> (pdf, 218.9 Kb )
                </li>
            </ul>
        </noscript>
    </div>
</div>

这是我的代码:

// Accumulator for the scraped hrefs; it is only filled inside the request
// callback below.
var links = [];

// Request `url` and, once the callback receives the html, load it into cheerio
// and collect the href of every anchor under `.ul_09 li`.
request(url, function(err, resp, html) {
    if(err) {
        throw err;
    }
    var $ = cheerio.load(html);

    // Wrap each matched anchor's href in a {"link": ...} object and append it.
    $('.ul_09 li a').each(function() {
        var link = $(this).attr('href');
        links.push({"link": link});
    });
});
// NOTE(review): this log sits outside the request callback, so it presumably
// runs before the callback has pushed anything, which would explain the empty
// array. Confirm by logging from inside the callback instead.
console.log(links);

所以我感兴趣的是每个a元素的href属性。当我打印links数组时,得到的是一个空数组[]。

我该如何解决?

谢谢!

编辑1

我试过了:

// Attempted rewrite: request `url` and collect anchor hrefs into the outer
// `links` variable, logging it inside ('1') and outside ('2') the callback.
// NOTE(review): `req` and the outer `res` are not used in this body, and
// nothing is awaited at the level of this async function itself.
async function pdfUrls(req, res) {
    request(url, function(err, res, html) {
        if(err) {
            throw err;
        }
        var $ = cheerio.load(html);

        /*let listOfLinks = $('noscript > ul > li > a').map(function(i, el) {
            return $(this).attr('href');
        }).toArray();
        links = listOfLinks;
        console.log('1', links);*/


        // scraping: visit every anchor under the `.ul_09` list items
        $('.ul_09 li a').each(function() {
            // NOTE(review): this callback is a plain (non-async) function, so
            // `await` is not valid here even though pdfUrls is async; this is
            // the line the SyntaxError points at. The other snippets read
            // `.attr('href')` without `await`.
            let link = await $(this).attr('href');
            links.push({"link": link});
        });
        console.log('1', links);

    }); // end request
    // Logged right after request() is called, outside its callback.
    console.log('2', links);
} 

但是我收到了这个错误:

                    let link = await $(this).attr('href');
                                     ^ SyntaxError: Unexpected identifier
at createScript (vm.js:80:10)
at Object.runInThisContext (vm.js:139:10)
at Module._compile (module.js:607:28)
at Object.Module._extensions..js (module.js:654:10)
at Module.load (module.js:556:32)
at tryModuleLoad (module.js:499:12)
at Function.Module._load (module.js:491:3)
at Module.require (module.js:587:17)

at require (internal/module.js:11:18)
at Object.<anonymous> (C:\...\app.js:5:29)

编辑2

这是我的代码:

// URL of the main page that is requested and scraped by pdfUrls().
var mainUrl = 'http://www.salute.gov.it/portale/documentazione/p6_2_8_3_1.jsp?id=20';

// Array of pdf links; starts empty and is reassigned inside the request
// callback in pdfUrls().
var links = [];

// Download handler: starts the scrape via pdfUrls() and logs `links` right away.
// NOTE(review): the reported output prints "links 3" before "links 1", i.e.
// pdfUrls() returns before its request callback has run, so `links` is still
// the initial [] at this point. `req` and `res` are not used here.
methods.download = function(req, res) {
    pdfUrls();
    console.log('links 3', links);
}; 


// Requests mainUrl and, in the callback, replaces the outer `links` with the
// hrefs matched by the selector. Logs `links` inside the callback ('links 1')
// and immediately after issuing the request ('links 2').
// NOTE(review): `req` and the outer `res` parameter are not used.
function pdfUrls(req, res) {
    request(mainUrl, function(err, res, html) {
        if(err) {
            throw err;
        }
        var $ = cheerio.load(html);

        // scraping first method: map each matched anchor to its href string
        let listOfLinks = $('noscript > ul > li > a').map(function(i, el) {
            return $(this).attr('href');
        }).toArray();
        links = listOfLinks;
        // NOTE(review): the reported output shows 'links 1 []' as well, so the
        // selector apparently matched nothing in the html that was actually
        // returned. Possibly the <noscript> content is not parsed into
        // elements by the cheerio version in use, or the served page differs
        // from the sample markup; verify by inspecting `html`.
        console.log('links 1', links);

        // scraping second method
        /*$('.ul_09 li a').each(function() {
            let link = $(this).attr('href');
            links.push({"link": link});
        });
        console.log('links 1', links);*/
    });
    // Runs right after request() is issued; the reported output shows this
    // line printing before 'links 1'.
    console.log('links 2', links);
} 

我分别用被注释掉的那段代码和未注释的那段代码运行过,结果总是这样:

links 2 []
links 3 []
links 1 []

我不明白究竟是什么问题。

1 个答案:

答案 0 :(得分:0)

您可以尝试这样做:

// Request the page and, inside the callback (where the html is available),
// extract the href of every anchor that is a direct child of a list item in a
// list directly under <noscript>.
request(url, (err, resp, html) => {
    if (err) {
        throw err;
    }

    const $ = cheerio.load(html);
    const anchors = $('noscript > ul > li > a');

    // Map each matched element to its href, then unwrap to a plain array.
    links = anchors.map((index, anchor) => $(anchor).attr('href')).toArray();

    // Log here, inside the callback, after `links` has been assigned.
    console.log(links);
});