接着之前的问题(question),我正在尝试使用 PhantomJS 抓取数据,脚本是在这里(here)的示例基础上修改的:
我的目标是把一个可以正常工作的函数(见第二个代码段)集成到下面的脚本(第一个代码段)中。我尝试过,但一直报错。有没有办法真正完成这种整合?
(注意:之所以使用 PhantomJS 这种无头浏览器,是因为该网站是一个 Angular 应用,初始 HTML 中不包含我要找的任何数据。所以我需要在内存中加载页面,等待 Angular 完成渲染(需要某种延迟),然后再抓取渲染后的 DOM。)
执行脚本(phantomjs scraping.js)时,我得到的错误和输出如下:
console> SPR-ERROR:103 - 无效的发布日期
console> V6 ReferenceError: Can't find variable: angular
http://stage.inc.com/js/Inc5000ListApp.js?UPDATE1:2
http://www.inc.com/inc5000/index.html:2485
console> SPR-ERROR:103 - 无效的发布日期(date)
============================================
Step "0"
============================================
console> Reached scrapeData
console>
看起来脚本已经成功连接到了目标网站。我该如何修改下面的脚本来解决这个问题(要整合的提取代码在底部):
var page = new WebPage(),
url = 'http://www.inc.com/inc5000/index.html',
stepIndex = 0;
/**
 * Relays messages logged inside the loaded page to the PhantomJS console.
 *
 * Per the PhantomJS documentation, this callback fires whenever the page
 * writes to its JavaScript console and may receive up to three arguments.
 * Only the message text is used here.
 *
 * @param {string} message - Text logged by the page.
 * @param {number} lineNumber - Line number of the log call (unused).
 * @param {string} sourceId - Source identifier of the log call (unused).
 */
page.onConsoleMessage = function (message, lineNumber, sourceId) {
    // Prefix the text so page output is distinguishable from script output.
    var relayed = 'console> ' + message;
    console.log(relayed);
};
/**
 * Relays JavaScript alert() calls made by the loaded page to the PhantomJS
 * console.
 *
 * Per the PhantomJS documentation, the only argument passed to this callback
 * is the alert's message string.
 *
 * @param {string} message - Text passed to alert() by the page.
 */
page.onAlert = function (message) {
    // Prefix the text so alerts stand out from regular console output.
    var relayed = 'alert!!> ' + message;
    console.log(relayed);
};
// Callback is executed each time a page is loaded...
page.open(url, function (status) {
    var JQUERY_FILE = 'jquery-1.6.1.min.js';

    if (status !== 'success') {
        // PhantomJS never terminates on its own, so a failed load must be
        // reported and the process ended explicitly instead of hanging.
        console.log('Failed to load "' + url + '" (status: ' + status + ')');
        phantom.exit(1);
        return;
    }

    // State is initially empty. State is persisted between page loads and
    // can be used for identifying which page we're on.
    console.log('============================================');
    console.log('Step "' + stepIndex + '"');
    console.log('============================================');

    // Inject jQuery for scraping (you need to save jquery-1.6.1.min.js in
    // the same folder as this file). injectJs reports failure through its
    // return value, so surface it rather than failing later on "$".
    if (!page.injectJs(JQUERY_FILE)) {
        console.log('Warning: could not inject ' + JQUERY_FILE);
    }

    // Our "event loop": run the first step when no handler has been stored
    // yet, otherwise run the handler stored by the previous step.
    if (!phantom.state) {
        //initialize();
        scrapeData();
    } else {
        phantom.state();
    }

    // Save screenshot for debugging purposes
    page.render("step" + stepIndex++ + ".png");
});
/**
 * Scrapes the table rows matching "tr.ng-scope" from the loaded page and logs
 * them as a JSON array through the page console.
 *
 * The rows are rendered by client-side script after the initial load, so the
 * DOM is polled until at least one row exists (or a bounded number of
 * attempts is exhausted) before extraction. Records are serialized with
 * JSON.stringify because page console messages reach PhantomJS as strings;
 * logging the array directly would lose the record contents.
 *
 * Side effect: stores parseResults in phantom.state as the next step.
 */
function scrapeData() {
    var POLL_INTERVAL_MS = 250;
    var MAX_ATTEMPTS = 40; // 40 * 250 ms = 10 s upper bound on waiting
    var attempts = 0;

    // True once at least one data row is present in the page DOM.
    function rowsRendered() {
        return page.evaluate(function () {
            return $('tr.ng-scope').length > 0;
        });
    }

    // Reads the first five cells of every data row and logs them as JSON.
    function extractRows() {
        page.evaluate(function () {
            console.log('Reached scrapeData');
            var DATA = [];
            $('tr.ng-scope').each(function () {
                var $tds = $(this).find('td');
                DATA.push({
                    rank: $tds.eq(0).text(),
                    company: $tds.eq(1).text(),
                    growth: $tds.eq(2).text(),
                    revenue: $tds.eq(3).text(),
                    industry: $tds.eq(4).text()
                });
            });
            console.log(JSON.stringify(DATA));
        });
    }

    // Re-checks the DOM until rows appear, then extracts; gives up waiting
    // after MAX_ATTEMPTS and extracts whatever is there.
    function poll() {
        attempts += 1;
        if (rowsRendered()) {
            extractRows();
            return;
        }
        if (attempts >= MAX_ATTEMPTS) {
            console.log('Timed out waiting for tr.ng-scope rows to render');
            extractRows();
            return;
        }
        setTimeout(poll, POLL_INTERVAL_MS);
    }

    phantom.state = parseResults;
    poll();
}
// Step 1
/**
 * Logs a "Searching..." message from inside the page and registers
 * parseResults as the next step.
 */
function initialize() {
    var announceSearch = function () {
        console.log('Searching...');
    };

    page.evaluate(announceSearch);

    // phantom.state is used to hold the handler for the next step so the
    // page-load callback knows what to run next.
    phantom.state = parseResults;
}
// Step 2
/**
 * Logs the href of every link under "#search-result" from inside the page,
 * then terminates PhantomJS.
 */
function parseResults() {
    var logResultLinks = function () {
        var $links = $('#search-result a');
        $links.each(function () {
            // Inside jQuery's each(), "this" is the current link element.
            console.log($(this).attr('href'));
        });
        console.log('Parsed results');
    };

    page.evaluate(logResultLinks);

    // A third step would be stored in phantom.state here instead, but the
    // page would have to reload for the load callback to run it.
    phantom.exit();
}
我知道下面的代码在控制台中可以正常运行,但我该如何把它与上面的脚本整合起来,从而成功抓取该网站多个页面上的数据:
// Downloads the listing page, parses the returned HTML with cheerio, and logs
// one record per "tr.ng-scope" row (rank, company, growth, revenue, industry).
// NOTE(review): this only sees the HTML as served; rows added later by
// client-side script will not be present — confirm against the live page.
request('http://www.inc.com/inc5000/index.html', function (error, response, html) {
    // Report failures instead of returning silently so an empty run can be
    // told apart from a failed one.
    if (error) {
        console.error('Request failed: ' + error);
        return;
    }
    if (response.statusCode !== 200) {
        console.error('Unexpected HTTP status: ' + response.statusCode);
        return;
    }

    var $ = cheerio.load(html);
    var DATA = [];
    $('tr.ng-scope').each(function () {
        var $tds = $(this).find('td');
        DATA.push({
            rank: $tds.eq(0).text(),
            company: $tds.eq(1).text(),
            growth: $tds.eq(2).text(),
            revenue: $tds.eq(3).text(),
            industry: $tds.eq(4).text()
        });
    });
    console.log(DATA);
});