需要在PhantomJS中打开一系列URL

时间:2015-07-30 19:02:48

标签: javascript phantomjs

我在 PhantomJS 中写了一个脚本,它的作用是从特定页面中抓取一些元素,这一部分工作正常。

以下是代码:

// Script-wide state shared by the step list and the interval driver.
var page = new WebPage();
var testindex = 0;          // index into `steps` of the next step to run
var loadInProgress = false; // flipped by the load handlers below
var fs = require('fs');
// Loop counters. NOTE(review): the loops that use i/j live inside
// page.evaluate, which runs in the page's context - these outer copies are
// presumably never read; verify before removing.
var i = 0;
var j = 0;
var k = 0;

// Forward console output from the page to PhantomJS's own console.
page.onConsoleMessage = function(message) {
    console.log(message);
};

// Track whether a page load is currently in flight.
page.onLoadStarted = function() {
    loadInProgress = true;
    console.log("load started");
};

page.onLoadFinished = function() {
    loadInProgress = false;
    console.log("load finished");
};

// Present the crawler as a desktop Chrome browser.
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';

// Optional toggle: uncomment to disable JavaScript on the loaded page.
// page.settings.javascriptEnabled = false;

// Ordered list of step functions. The setInterval driver elsewhere in this
// script calls steps[testindex]() one at a time while loadInProgress is false.
var steps = [

    // Step 1: start loading the product listing page, then ask PhantomJS to
    // inject jQuery.
    // NOTE(review): page.open is called without a callback, so injectJs runs
    // right after the load is started rather than after it finishes. Also,
    // PhantomJS documents injectJs for local script files and includeJs for
    // remote URLs - confirm that this call actually loads jQuery.
    function() { //Load Page
        page.open("http://www.example.com/mobiles/");
        page.injectJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js");
    },

    // Step 2: walk the product boxes inside the page and stream CSV fragments
    // back to PhantomJS through window.callPhantom.
    function() { //Fetch Products
        // Receives every window.callPhantom(...) payload sent from the page
        // and writes it to product-list.csv.
        // NOTE(review): mode 'w+' - in PhantomJS the '+' flag appears to mean
        // append, which would explain why successive fragments accumulate;
        // confirm against the fs module documentation.
        page.onCallback = function(result) {
            var fs = require('fs');
            fs.write('product-list.csv', result, 'w+');
        };

        // The function below is executed inside the web page, not in this
        // script's scope. NOTE(review): the undeclared i/j used in the loops
        // are therefore presumably globals of the page, not the outer i/j.
        page.evaluate(function() {
            var arr_mainList = new Array();
            var arr_innerList = new Array();

            try {
                // Iteration count = number of <ul> elements in the first
                // "item_grid"; the products themselves are taken from the
                // "lap_thu_box" elements at the same index.
                for (i = 0; i < (document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length); i++) {
                    arr_mainList.push(document.getElementsByClassName("lap_thu_box")[i]);

                    // Column 1: product title (text of the first link in the <h3>).
                    window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].textContent + ", ");
                    //window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href + ", ");

                    // Attempt to open the product detail page to read its
                    // description. The accompanying question reports that
                    // PhantomJS stops at this point.
                    // NOTE(review): getElementsByClassName is called on the
                    // value returned by window.open rather than on a document;
                    // that would presumably throw even if a window opened.
                    var myWindow = window.open(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href);
                    console.log(myWindow.getElementsByClassName("item_desc")[0].textContent);
                    myWindow.close();

                    // Logs the product link only when it is longer than 43
                    // characters; the meaning of 43 is not evident from this code.
                    if (arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href.length > 43) {
                        var innerURL = arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
                        console.log(innerURL);
                    }

                    // Next column: image URL read from the "data-original" attribute.
                    window.callPhantom(arr_mainList[i].getElementsByTagName("img")[0].getAttribute("data-original") + ", ");

                    // arr_innerList mirrors arr_mainList (same element at the same index).
                    arr_innerList.push(arr_mainList[i]); 

                    // Next column: the <li> texts of the box's first <ul>, commas
                    // stripped, joined with " | "; the last one ends the column with ", ".
                    for (j = 0; j < (document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length); j++) {                 
                        if ((j+1) < document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length) {
                            window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + " | ");
                        }
                        else {
                            window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + ", ");
                        }
                    };
                    //window.callPhantom(", ");
                    // Last column: price with commas stripped, then end of row.
                    window.callPhantom(arr_innerList[i].getElementsByClassName("cat_price")[0].textContent.replace(/,/g, ""));
                    window.callPhantom("\n");
                };

                // NOTE(review): this assignment runs in the page context, so it
                // presumably does not touch the outer loadInProgress flag - verify.
                loadInProgress = true;
                console.log("Successful.");
            }
            catch(ex) {
                // Any failure inside the loop ends the export and is only logged.
                console.log("Failed: " + ex);
            }
        });
    }
];

// Step driver. Every 5000 ms: run the next step if no page load is in
// progress; once no step is left, schedule the exit 100 ms later.
interval = setInterval(function() {
    var stepIsReady = typeof steps[testindex] == "function" && !loadInProgress;

    if (stepIsReady) {
        console.log("step " + (testindex + 1));
        steps[testindex]();
        testindex++;
    }

    // testindex may have just advanced, so look the slot up again.
    if (typeof steps[testindex] == "function") {
        return;
    }

    setTimeout(function() {
        //fs.write('product-list.html', page.content, 'w');
        console.log("test complete!");
        phantom.exit();
    }, 100);
}, 5000);

现在,如果我运行该程序,就能在 csv 文件中得到所有信息;但是一旦执行到 window.open,PhantomJS 就会停止。我知道不能在 page.evaluate 中打开新页面,但我需要获取产品说明,并用它代替产品链接写入 csv 文件。我已经查找了好几个小时,任何帮助都将不胜感激。注意:我的限制是必须使用 PhantomJS。

1 个答案:

答案 0 :(得分:0)

我稍微修改了你的脚本,现在你可以用它做任何想做的事。请记住,不要一次抓取太多条目,否则会遇到内存问题。因此,如果所抓取的网站存在分页,则应另用新的函数来处理。在这段代码中,我假设你需要的是每个设备的描述,但你也可以用同样的方式访问其他元素。

注意:您可能知道,跨域策略不允许我们使用 javascript / jQuery 访问 iFrame 的内容,这会是一个很大的障碍。因此,在 cmd / terminal 中执行脚本时,必须添加 `--web-security=no` 标记。

// Script-wide state shared by the step list and the interval driver.
var page = new WebPage();
// NOTE(review): innerPage is created here but not referenced anywhere else
// in the visible script - confirm whether it is still needed.
var innerPage = new WebPage();
var testindex = 0;          // index into `steps` of the next step to run
var loadInProgress = false; // flipped by the load handlers below
var fs = require('fs');
var i = 0;
var j = 0;
var k = 0;

// Forward console output from the page to PhantomJS's own console.
page.onConsoleMessage = function(message) {
    console.log(message);
};

// Track whether a page load is currently in flight.
page.onLoadStarted = function() {
    loadInProgress = true;
    console.log("load started");
};

page.onLoadFinished = function() {
    loadInProgress = false;
    console.log("load finished");
};

// Present the crawler as a desktop Chrome browser.
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';

// Optional toggle: uncomment to disable JavaScript on the loaded page.
//page.settings.javascriptEnabled = false;

// IMPORTANT FLAGS (given on the PhantomJS command line)
//--web-security=yes/no

var steps = [
  function() { //Load Page
    page.open("http://www.example.com/mobiles-apple/", function() {
        page.injectJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js");

        page.evaluate(function() {
            try {
                $("#main1").append('<div id="inner-data_iframes"></div>');

                for (i = 0; i < (document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length); i++) {
                    var iFrameAdd = document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
                    $("#inner-data_iframes").append('<iframe id="myIframe' + [i] + '" src="' + iFrameAdd + '"></iframe>');
                    window.document.body.scrollTop = document.body.scrollHeight;
                }
                console.log("Mission Successful.");
            }
            catch(ex) {
                console.log("Failed to add iFrame.");
            }
        });
    });
  },

  function() { //Fetch Products
      page.onCallback = function(result) {
          var fs = require('fs');
          fs.write('product-list.csv', result, 'w+');
    };

    page.evaluate(function() {
        var arr_mainList = new Array();
        var arr_innerList = new Array();

        try {
            for (i = 0; i < (document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length); i++) {
                arr_mainList.push(document.getElementsByClassName("lap_thu_box")[i]);

                window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].textContent + ", ");

                var desc = $("#myIframe" + [i]).contents().find(".item_desc").html();
                desc = desc.replace(/,/g, "");
                window.callPhantom(desc + ", ");

                window.callPhantom(arr_mainList[i].getElementsByTagName("img")[0].getAttribute("data-original") + ", ");

                arr_innerList.push(arr_mainList[i]); 

                for (j = 0; j < (document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length); j++) {

                    if ((j+1) < document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length) {
                        window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + " | ");
                    }
                    else {
                        window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + ", ");
                    }
                }

                window.callPhantom(arr_innerList[i].getElementsByClassName("cat_price")[0].textContent.replace(/,/g, ""));
                window.callPhantom("\n");
            }

                loadInProgress = true;
                console.log("Successful.");
            }
            catch(ex) {
                console.log("Failed: " + ex);
            }
        });
    }
];

// Step driver. Every 5000 ms: run the next step if no page load is in
// progress; once every step has been dispatched, stop ticking and exit
// PhantomJS 100 ms later.
// `interval` is declared explicitly so the timer handle is not an implicit
// global and can be cleared below.
var interval = setInterval(function() {
    if (!loadInProgress && typeof steps[testindex] === "function") {
        console.log("step " + (testindex + 1));
        steps[testindex]();
        testindex++;
    }

    // testindex may have just advanced, so the slot is checked again here.
    if (typeof steps[testindex] !== "function") {
        // No step left: cancel the interval so the exit timer is armed once.
        clearInterval(interval);
        setTimeout(function(){
            //fs.write('product-list.html', page.content, 'w');
            console.log("test complete!");
            phantom.exit();
        }, 100);
    }
}, 5000);