我正试图访问 this page，并从每个链接中抓取每篇论文的“标题”和“作者”。到目前为止，我有这个（我需要帮助的问题写在代码的注释中）：
// CasperJS utility module; this script uses it for utils.dump().
var utils = require('utils');
// Single Casper instance shared by every step of the scrape.
var casper = require('casper').create({
// Logging options: verbose output switched on, log level set to 'error'.
verbose: true,
logLevel: 'error',
pageSettings: {
// Images and plugins are switched off; the script only reads text and href attributes.
loadImages: false,
loadPlugins: false,
// User-agent string sent with requests (a desktop Chrome on Windows).
userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
},
// NOTE(review): jQuery is registered as a client script, but nothing in the
// visible code calls it — confirm it is still needed.
clientScripts: ['lib/jquery.min.js']
});
// Index of the next entry of `links` to visit; advanced by loopThroughThesisLinks.
var i = 0;
// Thesis page URLs; filled in by the casper.start() callback.
var links = [];
// One { title, author } record per visited thesis page; filled by getThesisData.
var thesis_data = [];
/**
 * Collects the href attribute of every element matched by the selector.
 * Reads the global `document`; the script hands this function to
 * casper.evaluate().
 *
 * @returns {Array} href attribute values, in document order
 */
function getThesisLinks() {
    var anchors = document.querySelectorAll(''); // Not sure what should go in ('')
    var hrefs = [];
    for (var n = 0; n < anchors.length; n++) {
        hrefs.push(anchors[n].getAttribute('href'));
    }
    return hrefs;
}
/**
 * Processes the link at the shared index `i`, then re-enters itself through
 * this.run() until every entry of `links` has been handled. Once the list is
 * exhausted it dumps `thesis_data` and exits.
 *
 * Must be invoked with `this` bound to the Casper instance.
 */
function loopThroughThesisLinks() {
    // Nothing left to visit: print what was collected and stop.
    if (!(i < links.length)) {
        utils.dump(thesis_data);
        this.exit();
        return;
    }

    var currentLink = links[i];
    this.echo('[LINK #' + i + '] ' + currentLink);
    getThesisData.call(this, currentLink);
    // Advance the shared index before scheduling the next round.
    i++;
    this.run(loopThroughThesisLinks);
}
/**
 * Opens one thesis page with this.start() and, in its callback, appends a
 * { title, author } record to the shared `thesis_data` array.
 *
 * Must be invoked with `this` bound to the Casper instance.
 *
 * @param {string} link URL of the thesis page to open
 */
function getThesisData(link) {
    this.start(link, function() {
        // Title of the thesis - not sure what element to insert for this.fetchText
        var thesisTitle = this.fetchText('');
        // Names of the authors - not sure what element to insert for this.fetchText
        var thesisAuthor = this.fetchText('');
        // Store the title & author pair in the thesis_data array
        thesis_data.push({
            title: thesisTitle,
            author: thesisAuthor
        });
    });
}
// Open the browse listing, collect the thesis links from it and store them
// in the shared `links` array.
casper.start('http://ses.library.usyd.edu.au/handle/2123/345/browse?type=dateissued&sort_by=2&order=DESC&rpp=1495&etal=0&submit_browse=Update', function() {
    var prefix = "http://ses.library.usyd.edu.au/handle/";
    var pos = 0;
    links = this.evaluate(getThesisLinks);
    // Turn every collected href into an absolute URL, in place.
    // NOTE(review): if the hrefs already begin with "/handle/", the result
    // repeats that segment — confirm against the page markup.
    while (pos < links.length) {
        links[pos] = prefix + links[pos];
        pos += 1;
    }
    utils.dump(links);
});
// Start the run; loopThroughThesisLinks is passed as the run callback.
casper.run(loopThroughThesisLinks);
任何帮助都将不胜感激。
答案 0 :(得分:1)
这是所有链接的简单CSS选择器:
// Matches each <a> that is a direct child of the third <td> in the rows of
// table.misctable. This is the selector for the empty querySelectorAll('')
// call in getThesisLinks.
var links = document.querySelectorAll(
'table.misctable > tbody > tr > td:nth-of-type(3) > a');
你也可以像这样使用XPath:
// `x` is CasperJS's selectXPath helper; its result is passed to this.fetchText() below.
var x = require('casper').selectXPath; // place this line at the top of the script
// Finds the table row whose first cell contains "Title:" and reads the text
// of the second cell in that same row.
var title = this.fetchText(x('//table//tr/td[1][contains(text(),"Title:")]/../td[2]'));
我认为你可以自己写出 authors 的查询。我可能会用不同的方式完成抓取，即在循环中使用 casper.thenOpen，因为额外的 start 和 run 调用分散在不同的函数中，代码很难阅读。
使用 casper.thenOpen，它看起来像这样:
var x = require('casper').selectXPath; // goes to the beginning of the file
/**
 * Processes the link at the shared index `i`, then queues itself again with
 * this.then() until every entry of `links` has been handled. Once the list
 * is exhausted it dumps `thesis_data` and exits.
 *
 * Must be invoked with `this` bound to the Casper instance.
 */
function loopThroughThesisLinks() {
    // Nothing left to visit: print what was collected and stop.
    if (!(i < links.length)) {
        utils.dump(thesis_data);
        this.exit();
        return;
    }

    var currentLink = links[i];
    this.echo('[LINK #' + i + '] ' + currentLink);
    getThesisData.call(this, currentLink);
    // Advance the shared index before queuing the next round.
    i++;
    this.then(loopThroughThesisLinks);
}
/**
 * Queues a visit to one thesis page with this.thenOpen() and, in its
 * callback, appends a { title, author } record to the shared `thesis_data`
 * array. Each value is the text of the second cell in the table row whose
 * first cell contains the matching label.
 *
 * Must be invoked with `this` bound to the Casper instance.
 *
 * @param {string} link URL of the thesis page to open
 */
function getThesisData(link) {
    this.thenOpen(link, function() {
        var titleCell = x('//table//tr/td[1][contains(text(),"Title:")]/../td[2]');
        var thesisTitle = this.fetchText(titleCell);
        var authorCell = x('//table//tr/td[1][contains(text(),"Authors:")]/../td[2]');
        var thesisAuthor = this.fetchText(authorCell);
        // Store the title & author pair in the thesis_data array
        thesis_data.push({
            title: thesisTitle,
            author: thesisAuthor
        });
    });
}