Postman 链接检查器:只检查单个页面上的链接

时间:2017-12-11 15:21:54

标签: java api web-crawler postman linkchecker

我正在使用 Postman 链接检查器(http://blog.getpostman.com/2017/06/23/check-for-broken-links-on-your-website-using-a-postman-collection/)。有没有办法只检查某个页面(xx 页面)上的所有链接,而不继续爬取这些链接所指向的页面?

// get environment variables
var start_url = postman.getEnvironmentVariable('start_url');
var root_url = postman.getEnvironmentVariable('root_url');
var links = JSON.parse(postman.getEnvironmentVariable('links'));
var url = postman.getEnvironmentVariable('url');
var index = parseInt(postman.getEnvironmentVariable('index'));

// increment index counter to access links in array to check
index = index + 1;

// test if link works
if (responseCode.code > 400) {
    //console.log("This link is broken: ", url);
    tests[("This link is broken: ", url)] = false;
} else {
    tests["Link works"] = true;
}

// if the current url includes the start_url, then this is an internal link and we should crawl it for more links
if (url.includes(start_url)) {

    // load the response body as HTML using cheerio, get the <a> tags
    var $ = cheerio.load(responseBody);

    $('a').each(function (index) {

        var link = $(this).attr('href');

        // add links to the links array if not already in there
        // if you have additional links you would like to exclude, for example, ads, you can add this criteria as well
        if (!links.includes(link)) {
            links.push(link);
        }
    });
}

// if we've gone through all the links, return early
if (links.length - 1 === index) {
    console.log('no more links to check');
    return;
}

// if link is a relative one, prepend with root_url
url = links[index]
if (! /^https?:\/\//.test(url)) {
    url = root_url + url;
}

// update environment variable values
postman.setEnvironmentVariable("links", JSON.stringify(links));
postman.setEnvironmentVariable("url", url);
postman.setEnvironmentVariable("index", index);

// continue calling the same request until all links are checked
postman.setNextRequest('Check URL');

0 个答案:

没有答案