从没有类或id的元素中抓取链接 - casperjs

时间:2015-03-29 04:31:05

标签: javascript casperjs

我目前正试图从这个网站抓取视频链接。链接位于一个轮播(carousel)滑块内。a 标签没有 class 或 id,因此我不确定该如何定位它们,以便 casperjs 能够获取 href 属性。如何在没有 class 或 id 属性的情况下抓取这些链接?

HTML

<ul class="videos" >
    <li>
        <a href="http://www.test.com/article2/0,2817,2471677,00.asp">
            <img src="http://assets1.ignimgs.com/thumbs/compact.jpg">
            <span class="video-title">
                <div> Fujitsu ScanSnap iX100 </div>
            </span>
        </a>
    </li>
    <li>
    </li>
    <li>
    </li>
    <li>
    </li>
    <li>
    </li>
</ul>

JS

// Shared crawler state: the Casper instance that drives the browser and the
// list of start pages handed to the scraper.
var casper = require('casper').create({}),
    urls = ['http://www.test.com/'];

/**
 * Queue one CasperJS step per URL that opens the page and collects the
 * href of every matching anchor.
 *
 * Nothing is fetched while this function runs: it only registers steps on
 * the global `casper` instance. The returned `pl` array is therefore empty
 * on return and is filled in place once the steps execute, so read it from
 * a step scheduled after this call.
 *
 * @param {string[]} x - Page URLs to open, in order.
 * @param {string} [selector='a'] - CSS selector of the anchors to collect,
 *     e.g. 'ul.videos li a' to limit the scrape to one list.
 * @returns {{pl: string[]}} Object whose `pl` array receives the hrefs.
 */
function linkScraper(x, selector) {
    var page_links = [];
    var linkSelector = selector || 'a';

    // Step body shared by all pages; `this` is whatever the step runner
    // binds (the Casper instance when run by CasperJS).
    function collectLinks() {
        // NOTE(review): getElementsInfo is expected to raise when nothing
        // matches the selector, so pages without matches are skipped.
        if (!this.exists(linkSelector)) {
            return;
        }
        this.getElementsInfo(linkSelector).forEach(function(element) {
            var href = element.attributes.href;
            // Anchors without a (non-empty) href carry no link to record.
            if (href) {
                page_links.push(href);
            }
        });
    }

    for (var i = 0; i < x.length; i++) {
        // The thenOpen callback already runs after the page has been
        // opened, so no extra nested step is needed.
        casper.thenOpen(x[i], collectLinks);
    }

    return {
        pl: page_links
    };
}

//Crawl
/**
 * Schedule the crawl of `webpages` plus a final step that reports how many
 * links were collected.
 *
 * Must be called with the Casper instance as `this` (for example
 * `stringifyResult.call(casper, urls)`); it uses `this.then` and
 * `this.echo`. Returns nothing: its only effect is to queue steps.
 *
 * @param {string[]} webpages - Page URLs forwarded to linkScraper.
 */
function stringifyResult(webpages) {
    // linkScraper only queues steps, so the list is still empty here; it is
    // read inside the step below, which runs after the scraping steps.
    var linksObj = linkScraper.call(this, webpages);
    this.then(function() {
        // Count the array entries. The length of the JSON text would be a
        // character count, not the number of links.
        this.echo(linksObj.pl.length + ' links found.');
    });
}

// Initialise the step queue and announce the crawl as the first step.
casper.start();
casper.then(function() {
    this.echo("Fetching........");
});

// stringifyResult returns nothing; it is called only to queue its steps.
// run() therefore receives no completion callback.
stringifyResult.call(casper, urls);
casper.run();

2 个答案:

答案 0 :(得分:3)

您可以通过ul.videos li a CSS选择器获取所有需要的链接:

/**
 * Collect the href attribute of every anchor inside the `ul.videos` list.
 *
 * Reads the global `document`, so it has to run where the page DOM is
 * available.
 *
 * @returns {Array} One entry per matched anchor, in document order, holding
 *     whatever getAttribute('href') yields for it.
 */
function getLinks() {
    var anchors = document.querySelectorAll('ul.videos li a');
    var hrefs = [];
    for (var idx = 0; idx < anchors.length; idx++) {
        hrefs.push(anchors[idx].getAttribute('href'));
    }
    return hrefs;
}

完整的工作示例:

// Casper instance that drives the headless browser.
var casper = require('casper').create({});
// Filled by the scraping step and read again in the completion callback.
var links;

/**
 * Gather the href attribute of each anchor found under `ul.videos li`.
 *
 * Uses the global `document`; in this script it is handed to
 * `casper.evaluate`, which is what makes the page DOM available to it.
 *
 * @returns {Array} The getAttribute('href') result for every matched
 *     anchor, in document order.
 */
function getLinks() {
    var hrefs = [];
    var matches = document.querySelectorAll('ul.videos li a');
    Array.prototype.forEach.call(matches, function(anchor) {
        hrefs.push(anchor.getAttribute('href'));
    });
    return hrefs;
}

// Open the listing page that contains the video carousel.
casper.start('http://www.pcmag.com/video/latest');

// Run getLinks through evaluate and keep the result for the final report.
casper.then(function() {
    links = this.evaluate(getLinks);
});

// Completion callback: print the number of links, then the links themselves.
casper.run(function() {
    // NOTE(review): `links` may not be an array here (the step above might
    // not have produced one, e.g. if the page-side call failed — confirm
    // against the CasperJS version in use). Fall back to an empty list so
    // the report cannot throw.
    var found = Array.isArray(links) ? links : [];
    this.echo(found.length + ' links found:');
    // Skip the bullet list when empty; joining nothing would still print a
    // lone " - " line.
    if (found.length > 0) {
        this.echo(' - ' + found.join('\n - '));
    }
    this.exit();
});

输出:

173 links found:
 - http://www.pcmag.com/article2/0,2817,2470070,00.asp
 - http://www.pcmag.com/article2/0,2817,2470084,00.asp
 - http://www.pcmag.com/article2/0,2817,2470087,00.asp
...
 - http://www.pcmag.com/article2/0,2817,2475543,00.asp
 - http://www.pcmag.com/article2/0,2817,2475409,00.asp
 - http://www.pcmag.com/article2/0,2817,2475359,00.asp

如何获取span文本示例:

/**
 * Collect the text content of every `span.video-title` inside the
 * `ul.videos` list.
 *
 * Reads the global `document`, so it has to run where the page DOM is
 * available.
 *
 * @returns {Array} The textContent of each matched span, in document order.
 */
function getSpanTexts() {
    var spans = document.querySelectorAll('ul.videos li span.video-title');
    var titles = [];
    for (var idx = 0; idx < spans.length; idx++) {
        titles.push(spans[idx].textContent);
    }
    return titles;
}

答案 1 :(得分:0)

这只是部分解决方案(我不熟悉Casper),但思路是替换

this.getElementsInfo('a')

改为通过类 videos 获取 ul 本身。然后你需要编写逻辑,先导航到 li,再导航到 a,并取出链接。也就是说,不要直接抓取所有链接;先抓取你感兴趣的、可以按类识别的列表元素,然后再从中提取链接元素。