使用 CasperJS 抓取由 JavaScript 动态加载的多个页面

时间:2016-09-14 21:46:27

标签: javascript casperjs

LoopLink 不允许将其网站/数据库中的数据抓取到其他数据库。本文仅用于在动态加载页面上进行无头浏览器测试的学术目的,请不要滥用他们的数据。

基本上,我只是尝试从多个页面获取图像链接,但这些页面是通过 JavaScript 动态加载的。所以我最终尝试使用构建在 PhantomJS 之上的无头浏览器工具 CasperJS。

现在我的问题是函数getThumbNails()输出两次相同的数据。这是输出的日志,请注意,第一个“1”和第一个“2”是相同的链接

PuffMagicDragon@SuperDankLinux:~/WeB$ casperjs --web-security=no --cookies-file=/tmp/mycookies.txt 9Cas.js 
First Page Is Loaded
Second Page Is Loaded
1 http://x.lnimg.com/photo/thumb_480/40f175f108f5492b9cdec6486d753f8d.jpg
1 http://x.lnimg.com/photo/thumb_480/29081ed96a6349a08c27424ce3bd2842.jpg
1 http://x.lnimg.com/photo/thumb_480/29cd278e7cc34d9782d0a22782af2134.jpg
1 http://x.lnimg.com/photo/thumb_480/3979dc0f0987407bb9f825f2a0cb3fa9.jpg
1 http://x.lnimg.com/photo/thumb_480/dd06239abbf1433099ad3278607e5d7f.jpg
1 http://x.lnimg.com/photo/thumb_480/6e7c6b6076d5414b8ee59baed3dc3131.jpg
1 http://x.lnimg.com/photo/thumb_480/97027946bbf745a59d44ac1c3e9d22fe.jpg
1 http://x.lnimg.com/photo/thumb_480/396fd224e85f42aea7a10e1873ed627c.jpg
1 http://x.lnimg.com/photo/thumb_480/62f6afc3f50942388df9fe66e99a2ab4.jpg
1 http://x.lnimg.com/photo/thumb_480/09ba5b97da4e47b1a97bac86e125001d.jpg
1 http://x.lnimg.com/photo/thumb_480/580dcc66cd7f48d8aae8f583cd8e5e4b.jpg
1 http://x.lnimg.com/photo/thumb_480/e62f6d11449b41ff93e191f6045cb304.jpg
1 http://x.lnimg.com/photo/thumb_480/596230f07ebf471383991a99bd43420a.jpg
1 http://x.lnimg.com/photo/thumb_480/da63be695fed4617b594d19e4aa0dc7a.jpg
1 http://x.lnimg.com/photo/thumb_480/58f28244a2494c868006a97534e694e6.jpg
1 http://x.lnimg.com/photo/thumb_480/1b0246dfe4314404b32147061198cf80.jpg
1 http://x.lnimg.com/photo/thumb_480/b10a9bb8252f4d3d9638284f6f68fd6b.jpg
1 http://x.lnimg.com/photo/thumb_480/48bf185da5f74a369dd629dd183bc8b3.jpg
1 http://x.lnimg.com/photo/thumb_480/7008392330f847ecb3af7058ce0b2e3b.jpg
1 http://x.lnimg.com/photo/thumb_480/fcf5618fc7d04265a912cce9f7dec344.jpg
1 //x.lnimg.com/images/search/map/pinprofileclose2.gif
1 
1 //x.lnimg.com/images/search/map/pinprofile-ViewDetails.gif
1 http://ak.t0.tiles.virtualearth.net/tiles/r0?g=5289&mkt=en-us&lbl=l0&stl=h&shading=hill&n=z
1 http://ak.dynamic.t1.tiles.virtualearth.net/comp/ch/0231?mkt=en-us&it=G,L&shading=hill&og=132&n=z&key=AhfWRPlwZlVGewL9th5vsIklmzvMkUMm8wx6pz5nXzgDlYXliZFSr6lBvgHYCZQj
2 http://x.lnimg.com/photo/thumb_480/40f175f108f5492b9cdec6486d753f8d.jpg
2 http://x.lnimg.com/photo/thumb_480/29081ed96a6349a08c27424ce3bd2842.jpg
2 http://x.lnimg.com/photo/thumb_480/29cd278e7cc34d9782d0a22782af2134.jpg
2 http://x.lnimg.com/photo/thumb_480/3979dc0f0987407bb9f825f2a0cb3fa9.jpg
2 http://x.lnimg.com/photo/thumb_480/dd06239abbf1433099ad3278607e5d7f.jpg
2 http://x.lnimg.com/photo/thumb_480/6e7c6b6076d5414b8ee59baed3dc3131.jpg
2 http://x.lnimg.com/photo/thumb_480/97027946bbf745a59d44ac1c3e9d22fe.jpg
2 http://x.lnimg.com/photo/thumb_480/396fd224e85f42aea7a10e1873ed627c.jpg
2 http://x.lnimg.com/photo/thumb_480/62f6afc3f50942388df9fe66e99a2ab4.jpg
2 http://x.lnimg.com/photo/thumb_480/09ba5b97da4e47b1a97bac86e125001d.jpg
2 http://x.lnimg.com/photo/thumb_480/580dcc66cd7f48d8aae8f583cd8e5e4b.jpg
2 http://x.lnimg.com/photo/thumb_480/e62f6d11449b41ff93e191f6045cb304.jpg
2 http://x.lnimg.com/photo/thumb_480/596230f07ebf471383991a99bd43420a.jpg
2 http://x.lnimg.com/photo/thumb_480/da63be695fed4617b594d19e4aa0dc7a.jpg
2 http://x.lnimg.com/photo/thumb_480/58f28244a2494c868006a97534e694e6.jpg
2 http://x.lnimg.com/photo/thumb_480/1b0246dfe4314404b32147061198cf80.jpg
2 http://x.lnimg.com/photo/thumb_480/b10a9bb8252f4d3d9638284f6f68fd6b.jpg
2 http://x.lnimg.com/photo/thumb_480/48bf185da5f74a369dd629dd183bc8b3.jpg
2 http://x.lnimg.com/photo/thumb_480/7008392330f847ecb3af7058ce0b2e3b.jpg
2 http://x.lnimg.com/photo/thumb_480/fcf5618fc7d04265a912cce9f7dec344.jpg
2 //x.lnimg.com/images/search/map/pinprofileclose2.gif
2 
2 //x.lnimg.com/images/search/map/pinprofile-ViewDetails.gif
2 http://ak.t0.tiles.virtualearth.net/tiles/r0?g=5289&mkt=en-us&lbl=l0&stl=h&shading=hill&n=z
2 http://ak.dynamic.t1.tiles.virtualearth.net/comp/ch/0231?mkt=en-us&it=G,L&shading=hill&og=132&n=z&key=AhfWRPlwZlVGewL9th5vsIklmzvMkUMm8wx6pz5nXzgDlYXliZFSr6lBvgHYCZQj

以下是文档中对 evaluate 工作方式的说明:CasperJS evaluate 示意图

这是我正在抓的网站:http://looplink.ensemblere.com/SearchResults

我使用以下命令在linux上通过bash运行脚本:

casperjs --web-security=no --cookies-file=/tmp/mycookies.txt Script.js
// Casper instance that drives the headless browser session.
var casper = require('casper').create({ verbose: true });

// Image links scraped from page one of the results.
var thumbNails;
// Image links scraped from page two of the results.
var thumbNails2;

/**
 * Collects the `src` attribute of every <img> element in the current document.
 * Passed to casper.evaluate(), so it only touches `document` and sticks to
 * ES5 syntax.
 *
 * @returns {Array} one entry per <img>, in document order; an entry is
 *     whatever getAttribute('src') returned for that element.
 */
function getThumbNails() {
    var images = document.querySelectorAll('img');
    var sources = [];
    for (var idx = 0; idx < images.length; idx++) {
        sources.push(images[idx].getAttribute('src'));
    }
    return sources;
};

// Echoes which results page appears to be showing. The presence of the
// "previous" paging button is taken to mean we are on page two.
function reportLoadedPage(browser) {
    var hasPreviousButton = browser.exists('a.searchPagingButton.pagingPreviousButton');
    browser.echo(hasPreviousButton ? 'Second Page Is Loaded' : 'First Page Is Loaded');
}

// Open the search results page (page one).
casper.start('http://looplink.ensemblere.com/SearchResults');

// Scrape the image links of page one.
casper.then(function () {
    var firstPageLinks = this.evaluate(getThumbNails);
    thumbNails = firstPageLinks;
});

// Confirm that we are on page one of the website.
casper.then(function () {
    reportLoadedPage(this);
});

// Click the "next" button to go to page two.
var nextPageButton = 'a.searchPagingButton.pagingNextButton';
casper.thenClick(nextPageButton);

// Confirm that we are on page two.
casper.then(function () {
    reportLoadedPage(this);
});

// Scrape again for page two; BUT HERE IS THE ISSUE *****
// It returns the same data as page one even though the step above reports
// that we are on page two.
casper.then(function () {
    var secondPageLinks = this.evaluate(getThumbNails);
    thumbNails2 = secondPageLinks;
});

// Run the queued steps, then print every collected link prefixed with the
// number of the page it came from ("1 <url>" / "2 <url>").
casper.run(function () {
    // Either list is still undefined if its scraping step never assigned it,
    // so fall back to an empty list rather than iterating a non-array.
    var linksPerPage = [thumbNails || [], thumbNails2 || []];

    linksPerPage.forEach(function (links, pageIndex) {
        links.forEach(function (link) {
            console.log((pageIndex + 1) + ' ' + link);
        });
    });

    // When run() is given a completion callback the script must end itself.
    // The Casper API documents exit() for that; done() belongs to the tester
    // module and is not a Casper method.
    this.exit();
});

1 个答案:

答案 0 :(得分:0)

好的,感谢 Artjom B. 在评论中建议我通过截图来确认页面是否已加载完成!

结果发现资源当时仍在加载中,如截图所示:still loading picture

添加

// Give the dynamically loaded page 40000 ms to finish, then log that the wait is over.
// Intended to be called from inside a Casper step, where `this` is the casper instance.
this.wait(40000, function() {
    this.echo('Waited for 40 seconds');
});

casper将等待40秒

如需通过屏幕截图进行调试,请尝试 this.capture('/home/SuperDankDude/yourfilename.png')(注意路径必须用引号括起来)

我还注意到该网站的移动版本与全屏版本不同,所以我还必须使用casper.options.viewportSize = {width: 1920, height: 1080};

// Casper instance that drives the headless browser session.
var casper = require('casper').create({ verbose: true });

// Use a desktop-sized viewport; the site serves a different layout to
// small (mobile-sized) screens.
casper.options.viewportSize = { width: 1920, height: 1080 };

// Image links scraped from page one of the results.
var thumbNails;
// Image links scraped from page two of the results.
var thumbNails2;

/**
 * Gathers the `src` attribute of each <img> element found in the document.
 * Handed to casper.evaluate(), so it relies only on `document` and on ES5
 * syntax.
 *
 * @returns {Array} the getAttribute('src') value of every <img>, in
 *     document order.
 */
function getThumbNails() {
    var collected = [];
    Array.prototype.forEach.call(document.querySelectorAll('img'), function (img) {
        collected.push(img.getAttribute('src'));
    });
    return collected;
};

// Open the search results page (page one).
casper.start('http://looplink.ensemblere.com/SearchResults');

// Scrape the image links of page one.
casper.then(function () {
    var firstPageLinks = this.evaluate(getThumbNails);
    thumbNails = firstPageLinks;
});

// Confirm that we are on page one; the "previous" paging button is taken
// to mean page two. Page one is also captured to a screenshot for debugging.
casper.then(function () {
    var hasPreviousButton = this.exists('a.searchPagingButton.pagingPreviousButton');
    if (hasPreviousButton) {
        this.echo('Second Page Is Loaded');
        return;
    }
    this.echo('First Page Is Loaded');
    this.capture("/home/votlon/WeB/firstpage.png");
});

// Click the "next" button to go to page two.
var nextPageButton = 'a.searchPagingButton.pagingNextButton';
casper.thenClick(nextPageButton);

// Confirm that we are on page two, then give its dynamically loaded content
// time to arrive before taking the debugging screenshot.
casper.then(function () {
    // The "previous" paging button is taken to mean we are on page two.
    if (!this.exists('a.searchPagingButton.pagingPreviousButton')) {
        this.echo('First Page Is Loaded');
        return;
    }

    this.echo('Second Page Is Loading');
    // wait() does not block: it schedules a delay that runs after this step
    // returns. The screenshot therefore has to be taken inside the callback,
    // otherwise it shows the page as it was before the 40 seconds elapsed.
    this.wait(40000, function () {
        this.echo('Waited for 40 seconds');
        this.capture("/home/votlon/WeB/secondpage.png");
    });
});

// Scrape the image links a second time, now for page two. Without the wait
// in the previous step this returned the same links as page one.
casper.then(function () {
    var secondPageLinks = this.evaluate(getThumbNails);
    thumbNails2 = secondPageLinks;
});

// Run the queued steps, then print every collected link prefixed with the
// number of the page it came from ("1 <url>" / "2 <url>").
casper.run(function () {
    // Either list is still undefined if its scraping step never assigned it,
    // so fall back to an empty list rather than iterating a non-array.
    var linksPerPage = [thumbNails || [], thumbNails2 || []];

    linksPerPage.forEach(function (links, pageIndex) {
        links.forEach(function (link) {
            console.log((pageIndex + 1) + ' ' + link);
        });
    });

    // When run() is given a completion callback the script must end itself.
    // The Casper API documents exit() for that; done() belongs to the tester
    // module and is not a Casper method.
    this.exit();
});