我在尝试编写一个相当简单的CasperJS刮刀时遇到了一些严重的问题。基本上,我想在网站上遍历一些搜索结果,跟踪每个结果,收集一些数据,然后返回当前搜索页面。完成此过程后,我想将结果写入文件。我有以下代码根本无法正常工作。请原谅任何明显的错误,我是javascript的新手,是Java,Ruby,C ++原生。
// This site can also be queried via URL, I initially wrote this serializer
// to use this approach, but I ended up going with CasperJS navigation instead.
// My problems seem agnostic to whether or not I navigate using page links or URL.
function serialize(json) {
var str = [];
for(var prop in json) {
if(json.hasOwnProperty(prop)) {
str.push(encodeURIComponent(prop) + "=" + encodeURIComponent(obj[prop]));
}
}
return str.join("&");
}
// Scrape Links and Names from the current page in the searh results
function getPageLinks() {
var dancers = document.querySelectorAll('h4 > a');
return Array.prototype.map.call(links, function(e) {
var result = {};
result[e.textContent] = e.getAttribute('href');
return result;
});
}
// For a given dancer, scrape the block of html containing the name of each donor,
// their donation amount, and any comments.
function scrapeDonorInfo() {
var donors = document.querySelectorAll('div.msgBottomInnCont > div.meta');
return Array.prototype.map.call(links, function(e) {
return e.innerHtml;
});
}
// Use Tail recursion to scrape the donors for every dancer in each page of the search results.
function scrapeAllDonors(dancers, startIndex) {
// Inject Underscore.js for utility methods (namely _.union())
this.page.injectJs('https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.8.3/underscore-min.js');
// Populate the links object only after there are links to scrape
casper.waitForSelector('h4 > a', function() {
var links = this.evaluate(getPageLinks);
dancers = this.evaluate(_.union(dancers, links));
});
this.echo('Links object populated', 'INFO'); // Log the message,
// using this.echo() for colored tags
// For every dancer page link on this page of search results,
// fetch their fundraising page, scrape their donors,
//
dancers.forEach(function(element, index, array) {
if(index >= startIndex) {
var name = Object.keys(element)[0];
var link = baseURL + element[name];
casper.thenOpen(link);
casper.waitForSelector('div.meta', function() {
var viewMore = 'a.viewMore';
if(casper.visible(viewMoreActivity)) {
casper.thenClick(viewMore);
}
element[name] = {"donor_info": this.evaluate(getDonorInfo)};
});
casper.back();
}
});
var nextLink = "a#next";
casper.waitForSelector(nextLink, function() {
// If the next button in the results is clickable, click it.
if (casper.visible(nextLink)) {
casper.thenClick(nextLink);
casper.thenEvaluate(scrapeAllDonors(dancers, dancers.length()));
} else {
// Otherwise, write the final results to file.
fs.write(save, dancers, 'w');
casper.echo("END")
}
});
}
// Note: This is the Phantom.js package 'fs', not the Node.js package.
var fs = require('fs');
// Create a dated file for scrape results
var fname = new Date().getTime() + '.txt';
var save = fs.pathJoin(fs.workingDirectory, 'data', fname);
// Initialize Casper.js with desired settings
var casper = require('casper').create({
verbose: true,
logLevel: 'debug',
pageSettings: {
loadImages: false,
loadPlugins: false
}
});
// Handler for Resource Errors
casper.on("resource.error", function(resourceError) {
console.log('Unable to load resource (#' + resourceError.id + 'URL:' + resourceError.url + ')');
console.log('Error code: ' + resourceError.errorCode + '. Description: ' + resourceError.errorString);
});
// Handler for Page Errors
casper.on("page.error", function (msg, trace) {
console.log( 'Error: ' + msg, 'ERROR' );
console.log( 'Trace: ' + trace, 'TRACE' );
});
// Handler for Blocking requests made by social components (facebook in particular)
casper.on("resource.requested", function(requestData, networkRequest){
console.log('Request (#' + requestData.id + '): ' + JSON.stringify(requestData) + "\n");
if (requestData.url.indexOf("facebook") !== -1) {
networkRequest.abort();
}
});
// BaseURL for the site, convenient for scrapeAllDonors
var baseURL = 'https://fundraise.nudm.org/';
casper.start('https://fundraise.nudm.org/search/fundraisers?page=1');
casper.then(scrapeAllDonors([], 0));
// Run everything in the stack, then notify and exit
casper.run(function() {
this.echo("DONE", 'INFO');
this.exit();
});
为了解决问题,Casper / Phantom拒绝打印任何日志消息,我无法弄清楚原因。当我没有调试运行时,我得到:
casperjs --ssl-protocol=tlsv1 Crawler.js
[info] [phantom] Starting...
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js. Domains, protocols and ports must match.
启用调试后,我得到:
casperjs --ssl-protocol=tlsv1 -debug=true Crawler.js
Unable to open file: -debug=true
Unable to load script -debug=true; check file syntax
dhcp-199-74-85-154:NUDM Expose williambyrne$ casperjs --ssl-protocol=tlsv1 --debug=true Crawler.js
2016-03-06T14:22:31 [DEBUG] CookieJar - Created but will not store cookies (use option '--cookies-file=<filename>' to enable persisten cookie storage)
2016-03-06T14:22:31 [DEBUG] Phantom - execute: Configuration
2016-03-06T14:22:31 [DEBUG] 0 objectName : ""
2016-03-06T14:22:31 [DEBUG] 1 cookiesFile : ""
2016-03-06T14:22:31 [DEBUG] 2 diskCacheEnabled : "false"
2016-03-06T14:22:31 [DEBUG] 3 maxDiskCacheSize : "-1"
2016-03-06T14:22:31 [DEBUG] 4 ignoreSslErrors : "false"
2016-03-06T14:22:31 [DEBUG] 5 localToRemoteUrlAccessEnabled : "false"
2016-03-06T14:22:31 [DEBUG] 6 outputEncoding : "UTF-8"
2016-03-06T14:22:31 [DEBUG] 7 proxyType : "http"
2016-03-06T14:22:31 [DEBUG] 8 proxy : ":1080"
2016-03-06T14:22:31 [DEBUG] 9 proxyAuth : ":"
2016-03-06T14:22:31 [DEBUG] 10 scriptEncoding : "UTF-8"
2016-03-06T14:22:31 [DEBUG] 11 webSecurityEnabled : "true"
2016-03-06T14:22:31 [DEBUG] 12 offlineStoragePath : ""
2016-03-06T14:22:31 [DEBUG] 13 offlineStorageDefaultQuota : "-1"
2016-03-06T14:22:31 [DEBUG] 14 printDebugMessages : "true"
2016-03-06T14:22:31 [DEBUG] 15 javascriptCanOpenWindows : "true"
2016-03-06T14:22:31 [DEBUG] 16 javascriptCanCloseWindows : "true"
2016-03-06T14:22:31 [DEBUG] 17 sslProtocol : "tlsv1"
2016-03-06T14:22:31 [DEBUG] 18 sslCertificatesPath : ""
2016-03-06T14:22:31 [DEBUG] 19 webdriver : ":"
2016-03-06T14:22:31 [DEBUG] 20 webdriverLogFile : ""
2016-03-06T14:22:31 [DEBUG] 21 webdriverLogLevel : "INFO"
2016-03-06T14:22:31 [DEBUG] 22 webdriverSeleniumGridHub : ""
2016-03-06T14:22:31 [DEBUG] Phantom - execute: Script & Arguments
2016-03-06T14:22:31 [DEBUG] script: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js"
2016-03-06T14:22:31 [DEBUG] 0 arg: "--casper-path=/usr/local/Cellar/casperjs/1.1-beta4/libexec"
2016-03-06T14:22:31 [DEBUG] 1 arg: "--cli"
2016-03-06T14:22:31 [DEBUG] 2 arg: "Crawler.js"
2016-03-06T14:22:31 [DEBUG] Phantom - execute: Starting normal mode
2016-03-06T14:22:31 [DEBUG] WebPage - setupFrame ""
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/fs.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/system.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/_coffee-script.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/package.json" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/coffee-script.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./lexer.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/././rewriter.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/././helpers.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./parser.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./helpers.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./nodes.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/././scope.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./././helpers.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/././lexer.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/../coffee-script/./lib/coffee-script/./././rewriter.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/webpage.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/package.json" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/cli.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/utils.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] Phantom - injectJs: "Crawler.js"
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/casper.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/colorizer.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/events.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/http.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/mouse.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/pagestack.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/querystring.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: "/usr/local/Cellar/casperjs/1.1-beta4/libexec/modules/tester.js" QMap(("mode", QVariant(QString, "r") ) )
[info] [phantom] Starting...
2016-03-06T14:22:31 [DEBUG] WebpageCallbacks - getJsConfirmCallback
2016-03-06T14:22:31 [DEBUG] WebpageCallbacks - getGenericCallback
2016-03-06T14:22:31 [DEBUG] WebpageCallbacks - getJsConfirmCallback
2016-03-06T14:22:31 [DEBUG] WebPage - setupFrame ""
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/fs.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/system.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/_coffee-script.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/webpage.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] WebPage - updateLoadingProgress: 10
2016-03-06T14:22:31 [DEBUG] WebPage - setupFrame ""
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/fs.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/system.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/_coffee-script.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] FileSystem - _open: ":/modules/webpage.js" QMap(("mode", QVariant(QString, "r") ) )
2016-03-06T14:22:31 [DEBUG] WebPage - updateLoadingProgress: 100
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js. Domains, protocols and ports must match.
2016-03-06T14:22:31 [DEBUG] WebPage - updateLoadingProgress: 10
2016-03-06T14:22:31 [DEBUG] WebPage - updateLoadingProgress: 100
有什么想法吗?
更新(在做出第一个建议的更改后)
williambyrne$ casperjs --ssl-protocol=tlsv1 Crawler.js
[info] [phantom] Starting...
[info] [phantom] Running suite: 3 steps
[debug] [phantom] opening url: https://fundraise.nudm.org/search/fundraisers?page=1, HTTP GET
[debug] [phantom] Navigation requested: url=https://fundraise.nudm.org/search/fundraisers?page=1, type=Other, willNavigate=true, isMainFrame=true
Request (#1): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"}],"id":1,"method":"GET","time":"2016-03-06T21:03:49.874Z","url":"https://fundraise.nudm.org/search/fundraisers?page=1"}
[debug] [phantom] url changed to "https://fundraise.nudm.org/search/fundraisers?page=1"
Request (#2): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"text/css,*/*;q=0.1"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":2,"method":"GET","time":"2016-03-06T21:03:51.112Z","url":"https://fundraise.nudm.org/css/sc_global.css?cuiv=1456860159443"}
Request (#3): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"text/css,*/*;q=0.1"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":3,"method":"GET","time":"2016-03-06T21:03:51.113Z","url":"https://fundraise.nudm.org/stylesheets/css/charity/search.css?cuiv=1456860159443"}
Request (#4): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"text/css,*/*;q=0.1"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":4,"method":"GET","time":"2016-03-06T21:03:51.113Z","url":"https://fundraise.nudm.org/css/white_label_header_v3.4.3.1.css?cuiv=1456860159443"}
Request (#5): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"text/css,*/*;q=0.1"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":5,"method":"GET","time":"2016-03-06T21:03:51.114Z","url":"https://fundraise.nudm.org/css/white_label_header_responsive.css?cuiv=1456860159443"}
Request (#6): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":6,"method":"GET","time":"2016-03-06T21:03:51.114Z","url":"https://ajax.googleapis.com/ajax/libs/jquery/1.8.1/jquery.min.js"}
Request (#7): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":7,"method":"GET","time":"2016-03-06T21:03:51.114Z","url":"https://fundraise.nudm.org/js/front_scripts.js?cuiv=1456860159443"}
Request (#8): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":8,"method":"GET","time":"2016-03-06T21:03:51.115Z","url":"https://fundraise.nudm.org/js/mobile_share.js?cuiv=1456860159443"}
Request (#9): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":9,"method":"GET","time":"2016-03-06T21:03:51.115Z","url":"https://fundraise.nudm.org/js/search.js?cuiv=1456860159443"}
Request (#10): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":10,"method":"GET","time":"2016-03-06T21:03:51.116Z","url":"https://fundraise.nudm.org/js/mobile.js?cuiv=1456860159443"}
Request (#11): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":11,"method":"GET","time":"2016-03-06T21:03:51.304Z","url":"https://ssl.google-analytics.com/ga.js"}
Request (#12): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":12,"method":"GET","time":"2016-03-06T21:03:51.304Z","url":"https://www.google-analytics.com/analytics.js"}
Request (#13): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":13,"method":"GET","time":"2016-03-06T21:03:51.309Z","url":"https://fundraise.nudm.org/css/fonts/proximanova/ProximaNova-Reg-webfont.woff"}
Request (#14): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":14,"method":"GET","time":"2016-03-06T21:03:51.313Z","url":"https://connect.facebook.com/en_US/sdk.js"}
Request (#15): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":15,"method":"GET","time":"2016-03-06T21:03:51.314Z","url":"https://fundraise.nudm.org/css/fonts/proximanova/ProximaNova-Sbold-webfont.woff"}
Request (#16): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":16,"method":"GET","time":"2016-03-06T21:03:51.315Z","url":"https://fundraise.nudm.org/css/fonts/pictos/pictos-webfont.woff"}
Request (#17): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":17,"method":"GET","time":"2016-03-06T21:03:51.315Z","url":"https://fundraise.nudm.org/css/fonts/proximanova/ProximaNova-Bold-webfont.woff"}
Request (#18): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":18,"method":"GET","time":"2016-03-06T21:03:51.316Z","url":"https://fundraise.nudm.org/css/fonts/proximanova/ProximaNova-Thin-webfont.woff"}
Request (#19): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"},{"name":"Accept","value":"*/*"}],"id":19,"method":"GET","time":"2016-03-06T21:03:51.317Z","url":"https://fundraise.nudm.org/css/fonts/entypo/entypo.woff"}
Unable to load resource (#14URL:)
Error code: 301. Description: Protocol "" is unknown
Request (#20): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":20,"method":"GET","time":"2016-03-06T21:03:51.796Z","url":"https://js-agent.newrelic.com/nr-885.min.js"}
Request (#21): {"headers":[{"name":"User-Agent","value":"Mozilla/5.0 (Macintosh; Intel Mac OS X) AppleWebKit/534.34 (KHTML, like Gecko) CasperJS/1.1.0-beta4+PhantomJS/1.9.8 Safari/534.34"},{"name":"Accept","value":"*/*"},{"name":"Referer","value":"https://fundraise.nudm.org/search/fundraisers?page=1"}],"id":21,"method":"GET","time":"2016-03-06T21:03:53.756Z","url":"https://bam.nr-data.net/1/67fe2a1b26?a=10291124&v=885.a559836&to=ZV0HYUJUCEYEU0QLC1wXJFZEXAlbSlRVBAVHVBEaQ1AHRwZYHwQRXFwXVFlGA0cW&rst=2645&ap=775&fe=686&dc=204&f=%5B%5D&at=SRoEFwpOG0g%3D&jsonp=NREUM.setToken"}
[debug] [phantom] Successfully injected Casper client-side utilities
[debug] [phantom] start page is loaded
[info] [phantom] Step anonymous 3/3 https://fundraise.nudm.org/search/fundraisers?page=1 (HTTP 200)
Links object populated
[info] [phantom] Step anonymous 3/3: done in 3944ms.
[info] [phantom] Step _step 4/5 https://fundraise.nudm.org/search/fundraisers?page=1 (HTTP 200)
[info] [phantom] Step _step 4/5: done in 3965ms.
[info] [phantom] waitFor() finished in 40ms.
[info] [phantom] Step anonymous 5/6 https://fundraise.nudm.org/search/fundraisers?page=1 (HTTP 200)
Error: ReferenceError: Can't find variable: links
Trace: [object Object],[object Object],[object Object]
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js. Domains, protocols and ports must match.
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js. Domains, protocols and ports must match.
Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/Cellar/casperjs/1.1-beta4/libexec/bin/bootstrap.js. Domains, protocols and ports must match.
似乎“链接”的范围存在一些问题。阵列。
更新2 :(对scrapeAllDonors
的更改)
// Use Tail recursion to scrape the donors for every dancer in each page of the search results.
function scrapeAllDonors(dancers, startIndex) {
// Inject Underscore.js for utility methods (namely _.union())
this.page.injectJs('https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.8.3/underscore-min.js');
// Populate the links object only after there are links to scrape
casper.waitForSelector('h4 > a', function() {
var links = this.evaluate(getPageLinks);
dancers = this.evaluate(_.union(dancers, links));
// For every dancer page link on this page of search results,
// fetch their fundraising page, scrape their donors,
//
dancers.forEach(function(element, index, array) {
if(index >= startIndex) {
var name = Object.keys(element)[0];
var link = baseURL + element[name];
casper.thenOpen(link);
casper.waitForSelector('div.meta', function(name) {
var viewMore = 'a.viewMore';
if(casper.visible(viewMoreActivity)) {
casper.thenClick(viewMore);
}
element[name] = {"donor_info": this.evaluate(getDonorInfo)};
}, name);
casper.back();
}
});
// If the next button in the results is clickable, click it.
var nextLink = "a#next";
if (casper.visible(nextLink)) {
casper.thenClick(nextLink);
casper.then(function() {
scrapeAllDonors.call(this, dancers, dancers.length());
});
} else {
// Otherwise, write the final results to file.
fs.write(save, dancers, 'w');
casper.echo("END")
}
});
this.echo('Donor Information Scraped', 'INFO'); // Log the message,
// using this.echo() for colored tags
}
答案 0 :(得分:1)
您已经犯了立即调用scrapeAllDonors
的错误,而不是稍后将其传递执行,此处:
casper.thenEvaluate(scrapeAllDonors(dancers, dancers.length()));
在这里:
casper.then(scrapeAllDonors([], 0));
这意味着它甚至在加载第一页之前执行,因此尝试对about:blank进行操作。如果你想这样调用它,你需要重构scrapeAllDonors
,以便它返回一个步进函数:
function scrapeAllDonors(dancers, startIndex) {
return function(){
// Inject Underscore.js for utility methods (namely _.union())
this.page.injectJs('https://cdnjs.cloudflare.com/ajax/libs/underscore.js/1.8.3/underscore-min.js');
// ...
var nextLink = "a#next";
casper.waitForSelector(nextLink, function() {
// ...
});
};
}
如果您不想更改scrapeAllDonors
,则可以通过替换
casper.then(scrapeAllDonors(...));
与
casper.then(function(){
scrapeAllDonors.call(this, ...)
});
我对What must be wrapped in then() statements in CasperJS? How to determine execution order of sync/async functions?的回答可能有助于理解CasperJS中异步执行的复杂性。