使用casperJS从网页上抓取一些链接

时间:2016-08-12 16:13:51

标签: javascript phantomjs casperjs

我正在使用 CasperJS 1.1.2 和 PhantomJS 2.1.1 从网页中提取一些链接。我感兴趣的链接的 href 属性中都包含字符串“javascript”,如下所示:

<td>
<a href="javascript:WebForm_DoPostBackWithOptions(new 
WebForm_PostBackOptions(&quot;ctl00$CenterContent$ctl01&quot;,
&quot;&quot;, true, &quot;&quot;, &quot;&quot;, false,
true))">Species A    
</a></td>
<td>
<a href="javascript:WebForm_DoPostBackWithOptions(new
WebForm_PostBackOptions(&quot;ctl00$CenterContent$ctl02&quot;,
&quot;&quot;, true, &quot;&quot;, &quot;&quot;, false,
true))">Species B   </a></td>
<td><a href="javascript:WebForm_DoPostBackWithOptions(new
WebForm_PostBackOptions(&quot;ctl00$CenterContent$ctl03&quot;,
&quot;&quot;, true, &quot;&quot;, &quot;&quot;, false,
true))">Sepcies C    </a></td>
<td>
<a href="javascript:WebForm_DoPostBackWithOptions(new
WebForm_PostBackOptions(&quot;ctl00$CenterContent$ctl04&quot;,
&quot;&quot;, true, &quot;&quot;, &quot;&quot;, false,
true))">Species D</a></td>
<td>
<a href="javascript:WebForm_DoPostBackWithOptions(new
WebForm_PostBackOptions(&quot;ctl00$CenterContent$ctl05&quot;,
&quot;&quot;, true, &quot;&quot;, &quot;&quot;, false,
true))">Species E    </a></td>

我用 CasperJS 写了如下脚本,用来抓取所有 href 属性中包含“javascript”字符串的链接,并把它们写入文件。

// Holds the scraped links; reassigned inside the casper.start callback below.
var links=[];
// CasperJS instance configured with a waitTimeout of 10000, verbose
// debug-level logging, and page settings that disable image and plugin loading.
var casper = require('casper').create({
  waitTimeout: 10000,
    verbose: true,
    logLevel: 'debug',
    pageSettings: {
        loadImages: false,
        loadPlugins: false
    }
});

var fs = require('fs');

// Open the target page; the callback stores the XPath matches (<a> elements
// directly under a <td> whose href contains "javascript") in `links`.
casper.start("https://apps.ams.usda.gov/CMS/", function()
    {
       // NOTE(review): `_utils_` is not defined anywhere in this script; CasperJS
       // documents its client utilities as `__utils__` (double underscores),
       // available inside evaluate() callbacks — confirm the intended identifier.
       links = _utils_.getElementsByXPath('.//td/a[contains(@href,"javascript")]');
    });

// NOTE(review): this statement sits at top level, outside the start callback and
// before casper.run() is called, so `links` is presumably still the empty array
// here — confirm. `links` is also passed to fs.write without being serialized.
fs.write("plantVarietyResults.json", links, 'w');


// Start executing the steps registered above.
casper.run();

我不明白为什么我的脚本没有把这些链接正确地写入文件。

1 个答案:

答案 0 :(得分:0)

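你对 CasperJS 存在一些误解,代码中也有几处错误:

- 客户端工具模块的名称是 `__utils__`(前后各两个下划线),而不是 `_utils_`。
- `__utils__` 只能在页面上下文中使用,也就是在 `casper.evaluate()` 的回调函数内部。
- DOM 节点无法从页面上下文中传出,需要先转换成可以序列化的形式(例如 href 字符串)。
- `casper.start` 的回调是异步执行的步骤函数;在步骤函数之外调用 `fs.write`,会在页面加载之前就被执行,此时 `links` 仍然是空数组。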

以下是一个应该有效的示例(未经测试):

// Open the page, gather the href of every <td>/<a> whose href contains
// "javascript", and write the resulting list to plantVarietyResults.json.
casper.start("https://apps.ams.usda.gov/CMS/", function() {
    // The function handed to evaluate() returns plain href strings rather
    // than the matched elements themselves.
    var links = this.evaluate(function() {
        var anchors = __utils__.getElementsByXPath('.//td/a[contains(@href,"javascript")]');
        var hrefs = [];
        for (var i = 0; i < anchors.length; i++) {
            hrefs.push(anchors[i].href);
        }
        return hrefs;
    });

    // Serialize the array before writing it out.
    var serialized = JSON.stringify(links);
    fs.write("plantVarietyResults.json", serialized, 'w');
});

// Start executing the registered step.
casper.run();

下面是一种略短的写法:

// selectXPath builds the XPath selector object passed to getElementsAttribute.
var x = require('casper').selectXPath;

// Open the page, read the href attribute of every <td>/<a> whose href contains
// "javascript", and write the list to plantVarietyResults.json as JSON.
casper.start("https://apps.ams.usda.gov/CMS/", function() {
    var anchorSelector = x('.//td/a[contains(@href,"javascript")]');
    var links = this.getElementsAttribute(anchorSelector, 'href');

    var serialized = JSON.stringify(links);
    fs.write("plantVarietyResults.json", serialized, 'w');
});

// Start executing the registered step.
casper.run();