我目前正尝试从该网站(SITE)抓取视频链接。这些链接位于一个轮播滑块(carousel)内。
a 标签没有 class 或 id,因此我不确定该如何定位它们,以便 CasperJS 能够获取其 href
属性。在没有 class 或 id 属性的情况下,应该如何抓取这些链接?
HTML
<!-- Carousel list. The only class hooks in this excerpt are "videos" on the <ul>
     and "video-title" on the <span>; the <a> itself carries no class or id. -->
<ul class="videos" >
<li>
<a href="http://www.test.com/article2/0,2817,2471677,00.asp">
<img src="http://assets1.ignimgs.com/thumbs/compact.jpg">
<span class="video-title">
<div> Fujitsu ScanSnap iX100 </div>
</span>
</a>
</li>
<!-- The remaining items are empty in this excerpt; presumably they repeat the
     structure of the first item on the live page (to be confirmed). -->
<li>
</li>
<li>
</li>
<li>
</li>
<li>
</li>
</ul>
JS
// CasperJS instance (created with an empty options object) that owns the step queue.
var casper = require('casper').create({});

// Pages to visit; this list is handed to stringifyResult when the script runs.
var urls = ['http://www.test.com/'];
/**
 * Queues one CasperJS step per URL that collects the href attribute of every
 * <a> element found on that page.
 *
 * The queued steps only execute once casper.run() is called, so the `pl` array
 * in the returned object is empty when this function returns and is filled in
 * as each page is opened.
 *
 * @param {string[]} x - URLs to open, in order.
 * @returns {{pl: string[]}} Wrapper whose `pl` array accumulates the hrefs.
 */
function linkScraper(x) {
    var page_links = [];

    // Step callback shared by every URL. It relies on CasperJS invoking step
    // callbacks with `this` bound to the casper instance.
    function collectHrefs() {
        this.getElementsInfo('a').forEach(function(element) {
            // Skip anchors that do not have an href attribute.
            if (!element.attributes.href) {
                return;
            }
            page_links.push(element.attributes.href);
        });
    }

    // No per-iteration state is stored outside the loop: the previous
    // assignment to an undeclared `current_page` leaked a global (and throws
    // in strict mode) without ever being read.
    for (var i = 0; i < x.length; i++) {
        casper.thenOpen(x[i], collectHrefs);
    }

    return {
        pl: page_links
    };
}
//Crawl
/**
 * Queues the scraping steps for `webpages`, followed by a final step that
 * reports how many links were collected.
 *
 * Must be invoked with `this` bound to the casper instance, e.g.
 * stringifyResult.call(casper, urls).
 *
 * @param {string[]} webpages - URLs handed on to linkScraper.
 */
function stringifyResult(webpages) {
    // linksObj.pl is still empty at this point: the steps queued by
    // linkScraper have not run yet. It is only read inside the step below,
    // which executes after them.
    var linksObj = linkScraper.call(this, webpages);
    this.then(function() {
        // Report the number of collected links. The previous code echoed the
        // length of the JSON string, i.e. a character count, not a link count.
        this.echo(linksObj.pl.length + ' links found.');
    });
}
// Initialise the step queue and make a progress message its first step.
casper.start();
casper.then(function() {
    this.echo("Fetching........");
});

// Queue the scraping and reporting steps. stringifyResult returns nothing,
// so run() is started without a completion callback.
stringifyResult.call(casper, urls);
casper.run();
答案 0 :(得分:3)
您可以使用 CSS 选择器 ul.videos li a
获取所有需要的链接:
/**
 * Returns the href attribute of every anchor matched by 'ul.videos li a'.
 * Reads the global `document`, so it has to execute where the page DOM is
 * available.
 *
 * @returns {Array} href attribute values, in the order querySelectorAll
 *     returned the anchors.
 */
function getLinks() {
    var anchors = document.querySelectorAll('ul.videos li a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
完整的工作示例:
// CasperJS instance that drives this example.
var casper = require('casper').create({});
// Assigned in the evaluate step and read in the run callback.
var links;
/**
 * Returns the href attribute of every anchor matched by 'ul.videos li a'.
 * Reads the global `document`; in this script it is handed to
 * casper.evaluate rather than called directly.
 *
 * @returns {Array} href attribute values, in the order querySelectorAll
 *     returned the anchors.
 */
function getLinks() {
    var anchors = document.querySelectorAll('ul.videos li a');
    var hrefs = [];
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
// Open the listing page, then collect the hrefs by evaluating getLinks.
casper.start('http://www.pcmag.com/video/latest').then(function() {
    links = this.evaluate(getLinks);
});

// Once every step has finished, print the count followed by one link per
// line, then exit.
casper.run(function() {
    var header = links.length + ' links found:';
    var listing = ' - ' + links.join('\n - ');
    this.echo(header);
    this.echo(listing).exit();
});
输出:
173 links found:
- http://www.pcmag.com/article2/0,2817,2470070,00.asp
- http://www.pcmag.com/article2/0,2817,2470084,00.asp
- http://www.pcmag.com/article2/0,2817,2470087,00.asp
...
- http://www.pcmag.com/article2/0,2817,2475543,00.asp
- http://www.pcmag.com/article2/0,2817,2475409,00.asp
- http://www.pcmag.com/article2/0,2817,2475359,00.asp
获取 span 文本的
示例:
/**
 * Returns the text content of every element matched by
 * 'ul.videos li span.video-title'. Reads the global `document`, so it has to
 * execute where the page DOM is available.
 *
 * @returns {Array} textContent values, in the order querySelectorAll returned
 *     the spans.
 */
function getSpanTexts() {
    var titleSpans = document.querySelectorAll('ul.videos li span.video-title');
    var titles = [];
    for (var i = 0; i < titleSpans.length; i++) {
        titles.push(titleSpans[i].textContent);
    }
    return titles;
}
答案 1 :(得分:0)
这只是部分解决方案(我不了解 Casper),但思路是把
this.getElementsInfo('a')
替换为通过类名 videos
获取 ul 元素
本身。然后编写逻辑先定位到 li
,再定位到其中的 a
并取出链接。也就是说,不要直接抓取链接;先抓取你感兴趣、且可以按类名识别的列表元素,然后再从中提取链接元素。