我在使用 casperjs 时遇到了一个比较严重的问题。我尝试从网站上提取 640 行数据。第一步是获取数据总数(即 640),之后我迭代所有找到的数据,并通过 sendAJAX(同步)存储到数据库。这是我的代码:
try{
    // Queue a step that persists the scraped results for the current city.
    // NOTE(review): this try presumably only guards the call that registers
    // the step, not the step body when it runs later -- confirm.
    casper.then(function(){
        if(total_page > 0){
            // Count the hotels collected over every scraped result page.
            var total_hotel = 0;
            for(var p=0; p<listHotelPage.length; p++){
                total_hotel += listHotelPage[p].listHotelObj.length;
            }

            // Save the hotel count for the related city.
            this.evaluate(function(total_hotel, city_code){
                var payload = { cityCode: city_code, hotelNum: total_hotel };
                __utils__.sendAJAX("http://localhost:9000/Cities/saveHotelNum", "POST" , payload, false, { contentType: "application/x-www-form-urlencoded" });
            }, total_hotel, city_code);
            this.echo("Saved Hotel With Amount = "+total_hotel);

            // Save every hotel (and its rooms) found for the related city.
            // The complete listHotelPage structure is handed to evaluate here;
            // the accompanying text identifies this call as the crashing part.
            this.evaluate(function(listHotelPage, city_code){
                var save_hotel_url = "http://localhost:9000/Hotels/saveHotelRest";
                var save_room_url = "http://localhost:9000/Rooms/saveRoomRest";

                // Synchronous form-encoded POST of one record.
                function post(url, payload){
                    __utils__.sendAJAX(url, "POST" , payload, false, { contentType: "application/x-www-form-urlencoded" });
                }

                window.__utils__.echo("List Hotel Page Length :"+listHotelPage.length);
                window.__utils__.echo("City Code :"+city_code);

                for(var pageIdx=0; pageIdx<listHotelPage.length; pageIdx++){
                    window.__utils__.echo("Iteration #"+pageIdx);
                    var hotels = listHotelPage[pageIdx].listHotelObj;
                    for(var h=0; h<hotels.length; h++){
                        var hotel = hotels[h];
                        var hotelRecord = {
                            hotelCode: hotel.hotel_code,
                            hotelName: hotel.hotel_name,
                            hotelAddress: hotel.hotel_address,
                            photo: hotel.base64img,
                            hotelStar: hotel.star_score,
                            cityCode: city_code
                        };
                        post(save_hotel_url, hotelRecord);

                        // One request per room category of this hotel.
                        var rooms = hotel.listDescriptionObj;
                        var roomCount = rooms.length;
                        for(var r=0; r<roomCount; r++){
                            var room = rooms[r];
                            post(save_room_url, {
                                hotelCode: hotelRecord.hotelCode,
                                categoryName: room.room_category,
                                roomService: room.room_service,
                                roomPrice: room.room_price,
                                available: room.available,
                                currencyCode: room.currencyCode
                            });
                        }
                        window.__utils__.echo("Save #"+pageIdx);
                    }
                }
            }, listHotelPage, city_code);
        }
        else if(total_page == null || total_page <= 0 || total_page == ""){
            // No result pages: record -1 as the hotel count for this city.
            this.evaluate(function(total_hotel, city_code){
                var payload = { cityCode: city_code, hotelNum: -1 };
                __utils__.sendAJAX("http://localhost:9000/Cities/saveHotelNum", "POST" , payload, false, { contentType: "application/x-www-form-urlencoded" });
            }, total_hotel, city_code);
        }

        // Report the elapsed time measured from time1 (set outside this snippet).
        time2 = new Date();
        var diff = Math.abs(time1 - time2);
        this.echo("Execution time :"+diff+" ms");
    });
    date_counter++;
}
catch(e){
    casper.then(function(){
        this.echo("Error 12");
    });
}
可疑原因在于此部分:
this.evaluate(function(listHotelPage, city_code){}
我尝试注释掉 evaluate 代码内部的所有代码,但崩溃仍然继续......
但是当我把整个这一部分都注释掉时,它运行得相当不错,崩溃消失了......
另外,如果我换成另一个只回显文本的 evaluate(用它取代我之前提到的 evaluate 代码),像这样:
// Reduced case: the callback only echoes the array length, but the whole
// listHotelPage array is still passed as an argument to evaluate.
this.evaluate(function(pages){
    window.__utils__.echo("List Hotel Page Length :"+pages.length);
}, listHotelPage);
它仍然会导致崩溃。
我仍然无法弄清楚为什么会这样。我认为这可能是因为使用了 evaluate 代码。但是 evaluate 代码怎么会带来这样的混乱?
有什么建议吗?
这是填充 listHotelPage 的方式:
// Scrape every hotel shown on the current result page into listHotelObj,
// append the page's results to listHotelPage, then click through to the
// next result page when there is one.
hotel_number = this.evaluate(function(){ return document.querySelectorAll("div#dResult > div").length; });
listHotelObj = [];
// Parsed once with an explicit radix instead of on every loop test.
var hotel_count = parseInt(hotel_number, 10);
for(var i=1; i<=hotel_count; i++){
    // XPath of the i-th hotel's header cell, shared by the text lookups below.
    var hotel_xpath = "//*[@id='dResult']/div["+i+"]/div/div[2]/div/div[2]";

    category_len = this.evaluate(function(i){ return document.querySelectorAll("div#dResult > div:nth-child("+i+") > div > div:nth-child(4) > div > div:nth-child(1) > span > div").length; }, i);
    div_hotel_id = this.evaluate(function(i){ return document.querySelector("div#dResult > div:nth-child("+i+")").id; }, i);
    // The hotel code is the element id with its "Display" part removed.
    hotel_code = div_hotel_id.replace("Display", "");
    hotel_name = this.fetchText(x(hotel_xpath+"/strong/a[1]"));
    hotel_address = this.fetchText(x(hotel_xpath+"/em/label"));
    hotel_style = this.fetchText(x(hotel_xpath+"/div[1]/span"));
    hotel_location = this.fetchText(x(hotel_xpath+"/div[2]/span"));
    star_amount = this.evaluate(function(i){ return document.querySelectorAll("#dResult > div:nth-child("+i+") > div > div:nth-child(2) > div > div:nth-child(3) > div > label > img").length; }, i);
    photo_url = this.evaluate(function(i){ return document.querySelector("#dResult > div:nth-child("+i+") > div > div:nth-child(2) > div > div:nth-child(1) > a > img").src; }, i);
    base64img = this.base64encode(photo_url);
    star_score = 0;
    var detail_url = "http://www.mgholiday.com/b2b/Accom/HotelDescription.php?Code="+hotel_code;

    // Collect one description object per room-category row of this hotel.
    listDescriptionObj = [];
    for(var rows=1; rows <= category_len; rows++){
        var room_xpath = "//*[@id='dResult']/div["+i+"]/div/div[4]/div/div[1]/span/div["+rows+"]";
        room_category = this.fetchText(x(room_xpath+"/div[1]/span"));
        room_service = this.fetchText(x(room_xpath+"/div[2]/span"));
        room_price = this.fetchText(x(room_xpath+"/div[3]/span"));
        image_src = this.evaluate(function(rows, i){
            return jQuery("div#dResult > div:nth-child("+i+") > div > div:nth-child(4) > div > div:nth-child(1) > span > div:nth-child("+rows+") > div:nth-child(4) > span > img")[0].src;
        }, rows, i);

        // Availability is derived from the file name of the status image.
        url_img_arr = image_src.split("/");
        status_img = url_img_arr[url_img_arr.length-1];
        available = (status_img == "btnAV-v3.gif") ? 1 : 0;

        // The price text is split on a space: amount first, currency second.
        var price_part = room_price.split(" ");
        var currencyCode = price_part[1];
        // Remove the comma separators and parse the whole amount. The former
        // code kept only the first comma group and multiplied it by 1000 per
        // comma, turning "1,234" into 1000. Amounts without a comma are
        // passed through unchanged, exactly as before.
        var amount_text = price_part[0];
        var pure_price = amount_text;
        if(amount_text.indexOf(",") !== -1){
            pure_price = parseFloat(amount_text.replace(/,/g, ""));
        }

        descriptionObj = {};
        descriptionObj.room_category = room_category;
        descriptionObj.room_service = room_service;
        descriptionObj.room_price = pure_price;
        descriptionObj.available = available;
        descriptionObj.currencyCode = currencyCode;
        listDescriptionObj.push(descriptionObj);
    }

    // Add up the star images: a full star counts 1, a half star 0.5.
    for(var rows=1; rows <= star_amount; rows++){
        var star_url = this.evaluate(function(i, rows){ return document.querySelector("#dResult > div:nth-child("+i+") > div > div:nth-child(2) > div > div:nth-child(3) > div > label > img:nth-child("+rows+")").src; }, i, rows);
        var star_name_array = star_url.split("/");
        var star_name = star_name_array[star_name_array.length-1];
        if(star_name == "star1.gif"){
            star_score += 1;
        }
        else if(star_name == "starh.gif"){
            star_score += 0.5;
        }
        // "star0.gif" (empty star) adds nothing to the score.
    }

    // phone_number, fax_number and total_room are not set in this fragment;
    // they are read from whatever the surrounding code assigned.
    hotelObj = {};
    hotelObj.hotel_code = hotel_code;
    hotelObj.hotel_name = hotel_name;
    hotelObj.base64img = base64img;
    hotelObj.star_score = star_score;
    hotelObj.hotel_address = hotel_address;
    hotelObj.hotel_location = hotel_location;
    hotelObj.hotel_style = hotel_style;
    hotelObj.phone_number = phone_number;
    hotelObj.fax_number = fax_number;
    hotelObj.total_room = total_room;
    hotelObj.listDescriptionObj = listDescriptionObj;
    listHotelObj.push(hotelObj);
}

// Record this page's hotels under its 1-based page number.
pageResultObj = {};
pageResultObj.page_num = (page_iterator+1);
pageResultObj.listHotelObj = listHotelObj;
listHotelPage.push(pageResultObj);
// Click the pager link unless this was the last page.
if(page_iterator < total_page-1){
    this.click(x("//*[@id='pg-top-cnt-"+(page_iterator+1)+"']"));
}
page_iterator++;
答案 0 :(得分:0)
我找到了解决方案。它与我的 listHotelPage 的填充方式无关。问题在于:当我在 evaluate 函数中运行大量 js 代码时就会出错(phantomjs 崩溃),因为 evaluate 无法运行或保存太多数据。我试过把 listHotelPage 传进去并在里面对它做逻辑操作,结果是一样的:崩溃。不过,如果确实需要 listHotelPage 的填充代码,它在这里......
// Scrape every hotel shown on the current result page into listHotelObj,
// append the page's results to listHotelPage, then click through to the
// next result page when there is one.
hotel_number = this.evaluate(function(){ return document.querySelectorAll("div#dResult > div").length; });
listHotelObj = [];
// Parsed once with an explicit radix instead of on every loop test.
var hotel_count = parseInt(hotel_number, 10);
for(var i=1; i<=hotel_count; i++){
    // XPath of the i-th hotel's header cell, shared by the text lookups below.
    var hotel_xpath = "//*[@id='dResult']/div["+i+"]/div/div[2]/div/div[2]";

    category_len = this.evaluate(function(i){ return document.querySelectorAll("div#dResult > div:nth-child("+i+") > div > div:nth-child(4) > div > div:nth-child(1) > span > div").length; }, i);
    div_hotel_id = this.evaluate(function(i){ return document.querySelector("div#dResult > div:nth-child("+i+")").id; }, i);
    // The hotel code is the element id with its "Display" part removed.
    hotel_code = div_hotel_id.replace("Display", "");
    hotel_name = this.fetchText(x(hotel_xpath+"/strong/a[1]"));
    hotel_address = this.fetchText(x(hotel_xpath+"/em/label"));
    hotel_style = this.fetchText(x(hotel_xpath+"/div[1]/span"));
    hotel_location = this.fetchText(x(hotel_xpath+"/div[2]/span"));
    star_amount = this.evaluate(function(i){ return document.querySelectorAll("#dResult > div:nth-child("+i+") > div > div:nth-child(2) > div > div:nth-child(3) > div > label > img").length; }, i);
    photo_url = this.evaluate(function(i){ return document.querySelector("#dResult > div:nth-child("+i+") > div > div:nth-child(2) > div > div:nth-child(1) > a > img").src; }, i);
    base64img = this.base64encode(photo_url);
    star_score = 0;
    var detail_url = "http://www.mgholiday.com/b2b/Accom/HotelDescription.php?Code="+hotel_code;

    // Collect one description object per room-category row of this hotel.
    listDescriptionObj = [];
    for(var rows=1; rows <= category_len; rows++){
        var room_xpath = "//*[@id='dResult']/div["+i+"]/div/div[4]/div/div[1]/span/div["+rows+"]";
        room_category = this.fetchText(x(room_xpath+"/div[1]/span"));
        room_service = this.fetchText(x(room_xpath+"/div[2]/span"));
        room_price = this.fetchText(x(room_xpath+"/div[3]/span"));
        image_src = this.evaluate(function(rows, i){
            return jQuery("div#dResult > div:nth-child("+i+") > div > div:nth-child(4) > div > div:nth-child(1) > span > div:nth-child("+rows+") > div:nth-child(4) > span > img")[0].src;
        }, rows, i);

        // Availability is derived from the file name of the status image.
        url_img_arr = image_src.split("/");
        status_img = url_img_arr[url_img_arr.length-1];
        available = (status_img == "btnAV-v3.gif") ? 1 : 0;

        // The price text is split on a space: amount first, currency second.
        var price_part = room_price.split(" ");
        var currencyCode = price_part[1];
        // Remove the comma separators and parse the whole amount. The former
        // code kept only the first comma group and multiplied it by 1000 per
        // comma, turning "1,234" into 1000. Amounts without a comma are
        // passed through unchanged, exactly as before.
        var amount_text = price_part[0];
        var pure_price = amount_text;
        if(amount_text.indexOf(",") !== -1){
            pure_price = parseFloat(amount_text.replace(/,/g, ""));
        }

        descriptionObj = {};
        descriptionObj.room_category = room_category;
        descriptionObj.room_service = room_service;
        descriptionObj.room_price = pure_price;
        descriptionObj.available = available;
        descriptionObj.currencyCode = currencyCode;
        listDescriptionObj.push(descriptionObj);
    }

    // Add up the star images: a full star counts 1, a half star 0.5.
    for(var rows=1; rows <= star_amount; rows++){
        var star_url = this.evaluate(function(i, rows){ return document.querySelector("#dResult > div:nth-child("+i+") > div > div:nth-child(2) > div > div:nth-child(3) > div > label > img:nth-child("+rows+")").src; }, i, rows);
        var star_name_array = star_url.split("/");
        var star_name = star_name_array[star_name_array.length-1];
        if(star_name == "star1.gif"){
            star_score += 1;
        }
        else if(star_name == "starh.gif"){
            star_score += 0.5;
        }
        // "star0.gif" (empty star) adds nothing to the score.
    }

    // phone_number, fax_number and total_room are not set in this fragment;
    // they are read from whatever the surrounding code assigned.
    hotelObj = {};
    hotelObj.hotel_code = hotel_code;
    hotelObj.hotel_name = hotel_name;
    hotelObj.base64img = base64img;
    hotelObj.star_score = star_score;
    hotelObj.hotel_address = hotel_address;
    hotelObj.hotel_location = hotel_location;
    hotelObj.hotel_style = hotel_style;
    hotelObj.phone_number = phone_number;
    hotelObj.fax_number = fax_number;
    hotelObj.total_room = total_room;
    hotelObj.listDescriptionObj = listDescriptionObj;
    listHotelObj.push(hotelObj);
}

// Record this page's hotels under its 1-based page number.
pageResultObj = {};
pageResultObj.page_num = (page_iterator+1);
pageResultObj.listHotelObj = listHotelObj;
listHotelPage.push(pageResultObj);
// Click the pager link unless this was the last page.
if(page_iterator < total_page-1){
    this.click(x("//*[@id='pg-top-cnt-"+(page_iterator+1)+"']"));
}
page_iterator++;
抱歉我的打字丢失了......
我们走了......
我从评估方法中提取代码......就像这样......
// Save every scraped hotel and its rooms, one evaluate call per record, so
// each call receives only a URL and a single small object instead of the
// whole listHotelPage structure.
//
// Both endpoints are declared once up front. save_hotel_url was previously
// used here without being defined in this snippet, and save_room_url was
// re-declared for every hotel.
var save_hotel_url = "http://localhost:9000/Hotels/saveHotelRest";
var save_room_url = "http://localhost:9000/Rooms/saveRoomRest";
for(var i=0; i<listHotelPage.length; i++){
    var hotels = listHotelPage[i].listHotelObj;
    for(var j=0; j<hotels.length; j++){
        var hotel = hotels[j];

        // Declared with var so the record no longer leaks as an implicit global.
        var data = {};
        data.hotelCode = hotel.hotel_code;
        data.hotelName = hotel.hotel_name;
        data.hotelAddress = hotel.hotel_address;
        data.photo = hotel.base64img;
        data.hotelStar = hotel.star_score;
        data.cityCode = city_code;
        this.evaluate(function(save_hotel_url, data){
            __utils__.sendAJAX(save_hotel_url, "POST" , data, false, { contentType: "application/x-www-form-urlencoded" });
        }, save_hotel_url, data);

        // Then one request per room category of this hotel.
        var rooms = hotel.listDescriptionObj;
        var room_length = rooms.length;
        for(var k=0; k<room_length; k++){
            var room_data = {};
            room_data.hotelCode = data.hotelCode;
            room_data.categoryName = rooms[k].room_category;
            room_data.roomService = rooms[k].room_service;
            room_data.roomPrice = rooms[k].room_price;
            room_data.available = rooms[k].available;
            room_data.currencyCode = rooms[k].currencyCode;
            this.evaluate(function(save_room_url, room_data){
                __utils__.sendAJAX(save_room_url, "POST" , room_data, false, { contentType: "application/x-www-form-urlencoded" });
            }, save_room_url, room_data);
        }
    }
}
正如您所看到的,我把 for 循环移到了 evaluate 函数之外,这样每次 evaluate 需要执行的内容就少得多......
在最后这个版本中,this.evaluate 函数只执行 __utils__.sendAJAX(save_room_url, "POST" , room_data, false, { contentType: "application/x-www-form-urlencoded" });
而在之前的示例中,所有迭代代码和所有抓取到的数据都是在 this.evaluate() 中处理的。