LoopLink 不允许将其网站/数据库中的数据抓取到其他数据库。本文仅出于学术目的,用于测试无头浏览器对动态加载页面的处理。请不要滥用他们的数据。
基本上,我只是想从多个页面获取图片链接,但这些页面是通过 JavaScript 动态加载的。所以我最终尝试使用构建在 PhantomJS 之上的无头浏览器 CasperJS。
现在我的问题是:函数 getThumbNails() 两次输出了相同的数据。下面是输出的日志,请注意,以“1”开头的那组链接和以“2”开头的那组链接完全相同。
PuffMagicDragon@SuperDankLinux:~/WeB$ casperjs --web-security=no --cookies-file=/tmp/mycookies.txt 9Cas.js
First Page Is Loaded
Second Page Is Loaded
1 http://x.lnimg.com/photo/thumb_480/40f175f108f5492b9cdec6486d753f8d.jpg
1 http://x.lnimg.com/photo/thumb_480/29081ed96a6349a08c27424ce3bd2842.jpg
1 http://x.lnimg.com/photo/thumb_480/29cd278e7cc34d9782d0a22782af2134.jpg
1 http://x.lnimg.com/photo/thumb_480/3979dc0f0987407bb9f825f2a0cb3fa9.jpg
1 http://x.lnimg.com/photo/thumb_480/dd06239abbf1433099ad3278607e5d7f.jpg
1 http://x.lnimg.com/photo/thumb_480/6e7c6b6076d5414b8ee59baed3dc3131.jpg
1 http://x.lnimg.com/photo/thumb_480/97027946bbf745a59d44ac1c3e9d22fe.jpg
1 http://x.lnimg.com/photo/thumb_480/396fd224e85f42aea7a10e1873ed627c.jpg
1 http://x.lnimg.com/photo/thumb_480/62f6afc3f50942388df9fe66e99a2ab4.jpg
1 http://x.lnimg.com/photo/thumb_480/09ba5b97da4e47b1a97bac86e125001d.jpg
1 http://x.lnimg.com/photo/thumb_480/580dcc66cd7f48d8aae8f583cd8e5e4b.jpg
1 http://x.lnimg.com/photo/thumb_480/e62f6d11449b41ff93e191f6045cb304.jpg
1 http://x.lnimg.com/photo/thumb_480/596230f07ebf471383991a99bd43420a.jpg
1 http://x.lnimg.com/photo/thumb_480/da63be695fed4617b594d19e4aa0dc7a.jpg
1 http://x.lnimg.com/photo/thumb_480/58f28244a2494c868006a97534e694e6.jpg
1 http://x.lnimg.com/photo/thumb_480/1b0246dfe4314404b32147061198cf80.jpg
1 http://x.lnimg.com/photo/thumb_480/b10a9bb8252f4d3d9638284f6f68fd6b.jpg
1 http://x.lnimg.com/photo/thumb_480/48bf185da5f74a369dd629dd183bc8b3.jpg
1 http://x.lnimg.com/photo/thumb_480/7008392330f847ecb3af7058ce0b2e3b.jpg
1 http://x.lnimg.com/photo/thumb_480/fcf5618fc7d04265a912cce9f7dec344.jpg
1 //x.lnimg.com/images/search/map/pinprofileclose2.gif
1
1 //x.lnimg.com/images/search/map/pinprofile-ViewDetails.gif
1 http://ak.t0.tiles.virtualearth.net/tiles/r0?g=5289&mkt=en-us&lbl=l0&stl=h&shading=hill&n=z
1 http://ak.dynamic.t1.tiles.virtualearth.net/comp/ch/0231?mkt=en-us&it=G,L&shading=hill&og=132&n=z&key=AhfWRPlwZlVGewL9th5vsIklmzvMkUMm8wx6pz5nXzgDlYXliZFSr6lBvgHYCZQj
2 http://x.lnimg.com/photo/thumb_480/40f175f108f5492b9cdec6486d753f8d.jpg
2 http://x.lnimg.com/photo/thumb_480/29081ed96a6349a08c27424ce3bd2842.jpg
2 http://x.lnimg.com/photo/thumb_480/29cd278e7cc34d9782d0a22782af2134.jpg
2 http://x.lnimg.com/photo/thumb_480/3979dc0f0987407bb9f825f2a0cb3fa9.jpg
2 http://x.lnimg.com/photo/thumb_480/dd06239abbf1433099ad3278607e5d7f.jpg
2 http://x.lnimg.com/photo/thumb_480/6e7c6b6076d5414b8ee59baed3dc3131.jpg
2 http://x.lnimg.com/photo/thumb_480/97027946bbf745a59d44ac1c3e9d22fe.jpg
2 http://x.lnimg.com/photo/thumb_480/396fd224e85f42aea7a10e1873ed627c.jpg
2 http://x.lnimg.com/photo/thumb_480/62f6afc3f50942388df9fe66e99a2ab4.jpg
2 http://x.lnimg.com/photo/thumb_480/09ba5b97da4e47b1a97bac86e125001d.jpg
2 http://x.lnimg.com/photo/thumb_480/580dcc66cd7f48d8aae8f583cd8e5e4b.jpg
2 http://x.lnimg.com/photo/thumb_480/e62f6d11449b41ff93e191f6045cb304.jpg
2 http://x.lnimg.com/photo/thumb_480/596230f07ebf471383991a99bd43420a.jpg
2 http://x.lnimg.com/photo/thumb_480/da63be695fed4617b594d19e4aa0dc7a.jpg
2 http://x.lnimg.com/photo/thumb_480/58f28244a2494c868006a97534e694e6.jpg
2 http://x.lnimg.com/photo/thumb_480/1b0246dfe4314404b32147061198cf80.jpg
2 http://x.lnimg.com/photo/thumb_480/b10a9bb8252f4d3d9638284f6f68fd6b.jpg
2 http://x.lnimg.com/photo/thumb_480/48bf185da5f74a369dd629dd183bc8b3.jpg
2 http://x.lnimg.com/photo/thumb_480/7008392330f847ecb3af7058ce0b2e3b.jpg
2 http://x.lnimg.com/photo/thumb_480/fcf5618fc7d04265a912cce9f7dec344.jpg
2 //x.lnimg.com/images/search/map/pinprofileclose2.gif
2
2 //x.lnimg.com/images/search/map/pinprofile-ViewDetails.gif
2 http://ak.t0.tiles.virtualearth.net/tiles/r0?g=5289&mkt=en-us&lbl=l0&stl=h&shading=hill&n=z
2 http://ak.dynamic.t1.tiles.virtualearth.net/comp/ch/0231?mkt=en-us&it=G,L&shading=hill&og=132&n=z&key=AhfWRPlwZlVGewL9th5vsIklmzvMkUMm8wx6pz5nXzgDlYXliZFSr6lBvgHYCZQj
以下是文档中对 evaluate 工作方式的说明:CasperJS Evaluate Diagram
这是我正在抓的网站:http://looplink.ensemblere.com/SearchResults
我使用以下命令在linux上通过bash运行脚本:
casperjs --web-security=no --cookies-file=/tmp/mycookies.txt Script.js
// Build the CasperJS instance; `verbose` enables real-time log output.
var casper = require('casper').create({ verbose: true });
// Holders for the image sources scraped from result page one and page two.
var thumbNails;
var thumbNails2;
/**
 * Collects the `src` attribute of every <img> element in the current document.
 * Intended to be passed to `casper.evaluate`, so it only touches `document`.
 *
 * @returns {Array} One entry per <img>, in document order; an entry is whatever
 *     `getAttribute('src')` yields for that element.
 */
function getThumbNails() {
    var images = document.querySelectorAll('img');
    var sources = [];
    for (var idx = 0; idx < images.length; idx += 1) {
        sources.push(images[idx].getAttribute('src'));
    }
    return sources;
}
// Open the search-results page; this queues the initial navigation step.
casper.start('http://looplink.ensemblere.com/SearchResults');

// Step 1: collect every <img> src present on page one.
casper.then(function () {
    thumbNails = this.evaluate(getThumbNails);
});

// Step 2: report which page is showing. The script treats the presence of the
// "previous page" button as the sign that we are past page one.
casper.then(function () {
    if (this.exists('a.searchPagingButton.pagingPreviousButton')) {
        this.echo('Second Page Is Loaded');
    } else {
        this.echo('First Page Is Loaded');
    }
});

// Step 3: click the "next page" button to request page two.
casper.thenClick('a.searchPagingButton.pagingNextButton');

// Step 4: the results are swapped in by JavaScript after the click, so the
// paging button can appear before the new images do. Scraping immediately
// returned page one's images again. Poll until the first image differs from
// page one's first image (assumes the first <img> is a listing thumbnail —
// TODO confirm against the live page), giving up after 40 seconds.
casper.waitFor(function firstImageChanged() {
    var pageOneLead = thumbNails && thumbNails.length ? thumbNails[0] : null;
    var current = this.evaluate(getThumbNails);
    return Boolean(current) && current.length > 0 && current[0] !== pageOneLead;
}, function reportPage() {
    // Same page check as step 2, now run once the new content is present.
    if (this.exists('a.searchPagingButton.pagingPreviousButton')) {
        this.echo('Second Page Is Loaded');
    } else {
        this.echo('First Page Is Loaded');
    }
}, function onTimeout() {
    // Keep going instead of aborting, but make the stale-data risk visible.
    this.echo('Timed out waiting for page two images; list 2 may repeat page one');
}, 40000);

// Step 5: collect every <img> src for page two.
casper.then(function () {
    thumbNails2 = this.evaluate(getThumbNails);
});

// Run the queued steps, then print both lists prefixed with their page number.
casper.run(function () {
    // `|| []` keeps the loops safe if an evaluate call produced no array.
    (thumbNails || []).forEach(function (src) {
        console.log('1 ' + src);
    });
    (thumbNails2 || []).forEach(function (src) {
        console.log('2 ' + src);
    });
    // With a custom run callback the script must terminate itself; Casper
    // exposes exit() for this (there is no done() on the casper object).
    this.exit();
});
答案 0 :(得分:0)
好的,感谢 Artjom B. 在评论中建议我截图,以确认页面是否已加载完成!
结果发现资源仍在加载中,即:still loading picture
添加
// Pause for 40000 ms (40 seconds), then log that the wait has finished.
this.wait(40000, function onWaitFinished() {
    this.echo('Waited for 40 seconds');
});
casper将等待40秒
如需通过屏幕截图调试,请尝试 this.capture('/home/SuperDankDude/yourfilename.png')
我还注意到该网站的移动版布局与全屏(桌面)版不同,所以我还必须设置 casper.options.viewportSize = {width: 1920, height: 1080};
// Build the CasperJS instance; `verbose` enables real-time log output.
var casper = require('casper').create({ verbose: true });
// Holders for the image sources scraped from result page one and page two.
var thumbNails;
var thumbNails2;
// Request a 1920x1080 viewport for the page.
casper.options.viewportSize = { width: 1920, height: 1080 };
/**
 * Collects the `src` attribute of every <img> element in the current document.
 * Intended to be passed to `casper.evaluate`, so it only touches `document`.
 *
 * @returns {Array} One entry per <img>, in document order; an entry is whatever
 *     `getAttribute('src')` yields for that element.
 */
function getThumbNails() {
    var images = document.querySelectorAll('img');
    var sources = [];
    for (var idx = 0; idx < images.length; idx += 1) {
        sources.push(images[idx].getAttribute('src'));
    }
    return sources;
}
// Open the search-results page; this queues the initial navigation step.
casper.start('http://looplink.ensemblere.com/SearchResults');

// Step 1: collect every <img> src present on page one.
casper.then(function () {
    thumbNails = this.evaluate(getThumbNails);
});

// Step 2: report which page is showing. The script treats the presence of the
// "previous page" button as the sign that we are past page one. On page one a
// screenshot is saved for debugging.
casper.then(function () {
    if (this.exists('a.searchPagingButton.pagingPreviousButton')) {
        this.echo('Second Page Is Loaded');
    } else {
        this.echo('First Page Is Loaded');
        this.capture("/home/votlon/WeB/firstpage.png");
    }
});

// Step 3: click the "next page" button to request page two.
casper.thenClick('a.searchPagingButton.pagingNextButton');

// Step 4: once the paging button shows we left page one, give the dynamically
// loaded content 40 seconds to finish before anything reads the page.
casper.then(function () {
    if (this.exists('a.searchPagingButton.pagingPreviousButton')) {
        this.echo('Second Page Is Loading');
        // wait() only queues the delay, so the screenshot is taken inside the
        // callback; calling capture() right after wait() would snapshot the
        // page before the 40 seconds had elapsed.
        this.wait(40000, function () {
            this.echo('Waited for 40 seconds');
            this.capture("/home/votlon/WeB/secondpage.png");
        });
    } else {
        this.echo('First Page Is Loaded');
    }
});

// Step 5: collect every <img> src for page two (runs after the queued wait).
casper.then(function () {
    thumbNails2 = this.evaluate(getThumbNails);
});

// Run the queued steps, then print both lists prefixed with their page number.
casper.run(function () {
    // `|| []` keeps the loops safe if an evaluate call produced no array.
    (thumbNails || []).forEach(function (src) {
        console.log('1 ' + src);
    });
    (thumbNails2 || []).forEach(function (src) {
        console.log('2 ' + src);
    });
    // With a custom run callback the script must terminate itself; Casper
    // exposes exit() for this (there is no done() on the casper object).
    this.exit();
});