将html变成一个庞大的数组

时间:2013-06-20 02:41:19

标签: php jquery xml dom

我有一个小项目,需要保存整个网站的 DOM,然后对它做一些操作,比如获取 CSS 计算样式(computed style)等(这部分将通过 jQuery 完成)。

目前我已经编写了几个函数来通过cURL获取HTML标记。有没有办法将这个DOM保存到多维数组中?递归?

目前我正在使用 https://code.google.com/p/php-html2array/(版本 1.01)。它基本能正常工作,只是不知为什么会漏掉一些元素,比如主要的 'body' 标签以及其他关键元素。测试用的网址是:http://www.vulytrampolines.com/

有人可以告诉我该如何实现这一点吗?或者,我该如何修改 Google Code 上的那个 PHP 库,使它能够识别 body 标签?

到目前为止我的代码(Pastebin):

<?php
/**
* Website Layout Checker
*
* @package  
* @author Marais Rossouw (marais.r@vulytrampolines.com)
* @copyright Vuly
* @version 2013
* @access public
*/

// Project bootstrap; presumably makes Zend_Http_Client loadable - TODO confirm against setup.php.
require_once '../setup.php';
// Raise the script execution time limit to 6000 seconds.
ini_set('max_execution_time', 6000);

/**
 * Downloads a page, turns its markup into a nested array with htmlParser and
 * stores both the raw HTML and the JSON-encoded array on disk.
 *
 * Progress messages and caught exceptions are collected in an in-memory
 * console that renderConsole() formats as text; nothing is thrown to callers.
 */
class layout {

        private $_LAYOUT, $_URL, $_DOC, $_LAYOUT_ARRAY, $_SAVE_TO_JSON, $_SAVE_TO_HTML, $_HTML_BODY;

        // Assigned in _INIT(). Declared explicitly because creating undeclared
        // (dynamic) properties is deprecated as of PHP 8.2.
        private $_FILE_NAME, $_FILE_PATH;

        private $_CONSOLE = array();

        /**
         * Fetches $url, parses the response and writes the two output files.
         *
         * Every failure is appended to the console log. When the page cannot
         * be fetched, parsing and saving are skipped altogether.
         *
         * @param string $url Address of the page to analyse.
         */
        public function __construct($url) {
                $this->_URL = $url;

                // Get the contents of the page specified.
                try {
                        $client = new Zend_Http_Client;
                        $client->setUri($url);
                        $client->setConfig(array('strictredirects' => true, 'maxredirects' => 10, 'timeout' => 8));
                        $response = $client->request();

                        $this->_LAYOUT = $response->getBody();
                } catch (Exception $e) {
                        $this->consoleLog($e);
                }

                // Without markup there is nothing to parse or save.
                if (!is_string($this->_LAYOUT) || $this->_LAYOUT === '') {
                        $this->consoleLog("No HTML was retrieved from the given URL; nothing to parse or save");
                        return;
                }

                // Build the DOMDocument, the array representation and the target file names.
                try {
                        $this->_INIT();
                } catch (Exception $e) {
                        $this->consoleLog($e);
                        return;
                }

                // Save the files
                $json = json_encode($this->_LAYOUT_ARRAY);
                if ($json === false) {
                        $this->consoleLog("The DOM array could not be encoded as JSON (json_last_error: " . json_last_error() . ")");
                } else {
                        $this->saveFile($this->_SAVE_TO_JSON, $json, "JSON");
                }
                $this->saveFile($this->_SAVE_TO_HTML, $this->_LAYOUT, "HTML");
        }

        /**
         * Writes $contents to $path and logs whether that worked.
         *
         * file_put_contents() signals failure through its return value, so the
         * result is checked explicitly. The try/catch only matters when an error
         * handler installed elsewhere converts warnings into exceptions.
         *
         * @param string $path     Target file.
         * @param string $contents Data to write.
         * @param string $label    Name used in the log message ("JSON", "HTML").
         * @return bool True when the file was written.
         */
        private function saveFile($path, $contents, $label) {
                try {
                        $written = file_put_contents($path, $contents);
                } catch (Exception $e) {
                        $this->consoleLog($e);
                        return false;
                }

                if ($written === false) {
                        $this->consoleLog("The " . $label . " file could not be saved to: " . $path);
                        return false;
                }

                $this->consoleLog("The " . $label . " file was saved to: " . $path);
                return true;
        }

        /**
         * Parses the fetched markup and prepares the output file names.
         */
        private function _INIT() {
                $doc = new DOMDocument();

                // Collect libxml parse errors internally while loading (real-world
                // HTML is rarely valid), then restore whatever setting was active.
                $previousErrorMode = libxml_use_internal_errors(true);
                $doc->loadHTML($this->_LAYOUT);
                $this->_DOC = new DOMXpath($doc);
                libxml_clear_errors();
                libxml_use_internal_errors($previousErrorMode);

                $this->consoleLog("DOMDocument created");

                $parser = new htmlParser($this->_LAYOUT);
                $this->_LAYOUT_ARRAY = $parser->toArray();

                $this->consoleLog("Dom array created");

                $this->consoleLog("There are " . count($this->_LAYOUT_ARRAY, COUNT_RECURSIVE) . " elements in the dom array");

                $this->_FILE_NAME = "VULY_LAYOUT_CHECKER-" . sha1(htmlspecialchars(trim($this->_URL)) . date("Ymd") . rand(99, 9999));
                // A forward slash works on Windows as well and keeps the relative
                // path usable inside a URL.
                $this->_FILE_PATH = "layout_checker/";

                $this->_SAVE_TO_JSON = $this->_FILE_PATH . $this->_FILE_NAME . ".txt";
                $this->_SAVE_TO_HTML = $this->_FILE_PATH . $this->_FILE_NAME . ".html";
        }

        /**
         * @return string|null The raw HTML that was fetched, or null if the fetch failed.
         */
        public function toString() {
                return $this->_LAYOUT;
        }

        /**
         * @return string|null innerHTML of the first body node found in the
         *                     parsed array, or null when there is none.
         */
        public function getBody() {
                $this->recurse($this->_LAYOUT_ARRAY);
                return $this->_HTML_BODY;
        }

        /**
         * Depth-first search for the first node whose tag is "body".
         *
         * The tag is checked before the children so that a body without child
         * elements is still found, and the search stops at the first match.
         *
         * @param array $file List of nodes as produced by htmlParser::toArray().
         */
        private function recurse($file) {
                if ($this->_HTML_BODY != false || !is_array($file)) { return; }

                foreach ($file as $node) {
                        if (isset($node['tag']) && strtolower($node['tag']) == "body") {
                                $this->_HTML_BODY = $node['innerHTML'];
                                return;
                        }

                        if (!empty($node['childNodes'])) {
                                $this->recurse($node['childNodes']);
                                if ($this->_HTML_BODY != false) { return; }
                        }
                }
        }

        /**
         * @return string|false Contents of the saved JSON file, or false when
         *                      no file was saved or it cannot be read.
         */
        public function getJSON_FILE() {
                if (!is_string($this->_SAVE_TO_JSON) || !is_readable($this->_SAVE_TO_JSON)) {
                        return false;
                }
                return file_get_contents($this->_SAVE_TO_JSON);
        }

        /**
         * @return string|null Relative path of the saved HTML copy, or null
         *                     when the page was never parsed.
         */
        public function get_SAVE_TO_HTML() {
                return $this->_SAVE_TO_HTML;
        }

        /**
         * Appends an entry to the console log.
         *
         * @param mixed $string Message, or an exception (converted to text on render).
         */
        public function consoleLog($string) {
                $this->_CONSOLE[] = $string;
        }

        /**
         * @return string All console entries, one per line, prefixed with their index.
         */
        public function renderConsole() {
                $return = "";

                // Width of the "N:" prefix: digits of the entry count plus two.
                $_PAD_SIZE = strlen((string) count($this->_CONSOLE)) + 2;

                foreach ($this->_CONSOLE as $key => $value) {
                        $return .= str_pad($key . ":", $_PAD_SIZE) . $value . "\n";
                }
                return $return;
        }

}

/**
 * Regex-based HTML-to-array converter.
 *
 * The markup is reduced from the inside out: on each pass the elements that
 * match are cut out of the working string and replaced by a placeholder of the
 * form <separator><level>-<index><separator> (e.g. "~2-5~"). The cut-out
 * elements are stored per pass in $levelArray, and toArray() finally follows
 * the placeholders to rebuild the nesting as a tree of arrays.
 *
 * Limitation: the separator character must not occur in the markup itself.
 */
class htmlParser {

        //your very own separator
        //do not enter characters such as < or >
        private $separator = '~';
        //the tags that don't have any innerHTML in them
        //feel free to add some if I missed any
        private $singleTags = 'meta|img|hr|br|link|!--|!DOCTYPE|input';

        //-- Don't edit below this --

        //elements whose content holds no further tags (placeholders count as text)
        private $simpleTagPattern = '/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is';
        //whatever still looks like an element once the simple ones are gone
        private $remainingTagPattern = '/<(.[^\s]*)(.[^><]*)?>(.*)?<\/\1>/is';

        private $html,$level;
        public $levelArray;

        /**
         * @param string $html Markup to parse; may also be passed to toArray().
         */
        function __construct($html='') {
                $this->html=$this->removeWhiteSpace($html);
                $this->level=-1;
                $this->levelArray=array();
        }
        function __destruct() {
                //nothing yet;
        }
        /**
         * Splits a placeholder such as "~2-5~" into array('2', '5').
         */
        private function splitPlaceholder($value) {
                $ar = explode($this->separator,$value);
                return explode('-',$ar[1]);
        }
        /**
         * Returns the stored element a placeholder refers to.
         */
        private function getElement($value) {
                $ar = $this->splitPlaceholder($value);
                return $this->levelArray[$ar[0]][$ar[1]];
        }
        /**
         * Expands every placeholder in $str back into markup.
         *
         * With $level 0 the stored htmlText is inserted as is; for any other
         * level it is wrapped in the element's own opening and closing tag.
         */
        private function parseToHTML($str,$level) {
                foreach ($this->getArrayOfReplacements($str) as $item) {
                        $elem = $this->getElement($item);
                        if ($level==0) {
                                $markup = $elem['htmlText'];
                        } else {
                                $markup = '<'.$elem['tag'].$elem['attr'].'>'.$elem['htmlText'].'</'.$elem['tag'].'>';
                        }
                        $str = str_replace($item,$markup,$str);
                }
                return $str;
        }
        /**
         * Builds the pattern matching any tag listed in $singleTags.
         */
        private function singleTagPattern() {
                return '/<('.$this->singleTags.')(.[^><]*)?>/is';
        }
        /**
         * Removes tags like img, input etc. from the working string.
         */
        private function replaceSingleTags() {
                $result=preg_match_all($this->singleTagPattern(), $this->html, $m);
                if ($result>0) {
                        //an array search is applied match by match, in order
                        $this->html = str_replace($m[0],'',$this->html);
                }
        }
        /**
         * Shared pass used by replaceSimpleTags() and replaceRemainingTags().
         *
         * Every match of $pattern opens a new level, is recorded in $levelArray
         * and is swapped for its placeholder in the working string.
         *
         * @param string $pattern        Regex with groups 1 = tag, 2 = attributes, 3 = content.
         * @param bool   $wholeAtLevel0  On the very first level, store the whole
         *                               match (true) or only its content (false)
         *                               as htmlText.
         */
        private function collapseMatches($pattern,$wholeAtLevel0) {
                $result=preg_match_all($pattern, $this->html, $m);
                if (!($result>0)) return;

                $this->level++;
                $oneLevel=array();
                foreach ($m[0] as $id => $value) {
                        $rep = $this->separator.$this->level.'-'.$id.$this->separator;

                        if ($this->level==0) $htmlText = $wholeAtLevel0 ? $value : $m[3][$id];
                        else $htmlText = $this->parseToHTML($m[3][$id],$this->level-1);

                        $oneLevel []= array('str' => $value, 'rep' => $rep, 'tag' => $m[1][$id], 'level' => $this->level, 'text' => $m[3][$id], 'attr' => $m[2][$id] , 'htmlText' => $htmlText);

                        $this->html = str_replace($value,$rep,$this->html);
                }
                $this->levelArray [$this->level] = $oneLevel;
        }
        private function replaceSimpleTags() {
                //tags that only have text in them (no other content)
                $this->collapseMatches($this->simpleTagPattern,true);
        }
        private function replaceRemainingTags() {
                //tags that remain after everything
                $this->collapseMatches($this->remainingTagPattern,false);
        }
        private function existSimpleTags() {
                return preg_match($this->simpleTagPattern, $this->html)>0;
        }
        private function existSingleTags() {
                return preg_match($this->singleTagPattern(), $this->html)>0;
        }
        /**
         * Strips line breaks, tabs and &nbsp; and collapses runs of spaces.
         */
        private function removeWhiteSpace ($string) {
                $string = str_replace(array("\n","\r",'&nbsp;',"\t"),'',$string);
                return preg_replace('|  +|', ' ', $string);
        }
        /**
         * Converts the markup into a tree of nodes.
         *
         * Each node is an array with the keys tag, innerHTML, repl, stratr,
         * level, father and childNodes (a list of nodes of the same shape).
         *
         * @param string $html Optional markup replacing what the constructor got.
         * @return array List of top-level nodes.
         */
        public function toArray($html='') {

                //first part: coding
                if ($html!='') {
                        $this->html = $this->removeWhiteSpace($html);
                        //new input: forget the levels of any earlier run so that
                        //placeholders are numbered from scratch again
                        $this->level = -1;
                        $this->levelArray = array();
                }
                while ($this->existSimpleTags() || $this->existSingleTags()) {
                        $this->replaceSingleTags();
                        $this->replaceSimpleTags();
                }
                $this->replaceRemainingTags();

                //now decoding
                return $this->getArray($this->html);
        }
        /**
         * Lists the placeholders contained in $str, in order of appearance.
         */
        private function getArrayOfReplacements($str) {
                $final=array();
                $ar=explode($this->separator,$str);
                //splitting on the separator puts the placeholder bodies at the odd indexes
                $total=count($ar);
                for ($i=1;$i<$total;$i+=2) {
                        $final []= $this->separator.$ar[$i].$this->separator;
                }
                return $final;
        }
        private function startsWithText($str) {
                $first=substr(trim(str_replace(array("\n","\r"),'',$str)),0,1);
                if ($first=='<' || $first=='>') return false;
                return true;
        }
        private function strInArray($array,$str) {
                foreach ($array as $item) {
                        if (strpos($str,$item)!==false)
                                return true;
                }
                return false;
        }
        /**
         * Recursively turns the placeholders found in $html into nodes.
         *
         * @param string $html   Text that may contain placeholders.
         * @param string $father Placeholder of the parent node ('' at the top).
         */
        private function getArray($html, $father='') {
                $final=array();
                if (strpos($html,$this->separator)===false) return $final;

                foreach ($this->getArrayOfReplacements($html) as $rep) {
                        $ar = $this->splitPlaceholder($rep);
                        $elem = $this->levelArray[$ar[0]][$ar[1]];
                        //remember the parent so that loadNode() can report it later
                        $this->levelArray[$ar[0]][$ar[1]]['father'] = $father;

                        $final []= array( 'tag' => $elem['tag'], 'innerHTML' => $elem['htmlText'], 'repl' => $elem['rep'],'stratr' => $elem['attr'], 'level' => $elem['level'], 'father' => $father, 'childNodes' => $this->getArray($elem['text'],$rep));
                }
                return $final;
        }
        /**
         * Returns a single node (without childNodes) for a placeholder.
         *
         * 'father' is null for an element that toArray() has not linked to a
         * parent.
         */
        public function loadNode($rep) {
                $elem = $this->getElement($rep);
                return array( 'tag' => $elem['tag'], 'innerHTML' => $elem['htmlText'], 'repl' => $elem['rep'],'stratr' => $elem['attr'], 'level' => $elem['level'], 'father' => isset($elem['father']) ? $elem['father'] : null);
        }
}

// Values rendered by the template below; they stay empty until a valid URL
// has been submitted and processed.
$console = "";
$json_file = "";
$toString = "";
$getBody = "";

if (isset($_REQUEST['layout'])) {
        // The request value is untrusted: it must be a string holding an
        // absolute http(s) URL before it is handed to the HTTP client.
        // NOTE(review): this does not stop requests to internal hosts (SSRF);
        // restrict the allowed hosts if this page is reachable by outsiders.
        $requested = is_string($_REQUEST['layout']) ? trim($_REQUEST['layout']) : '';
        $parts = ($requested !== '') ? parse_url($requested) : false;
        $scheme = (is_array($parts) && isset($parts['scheme'])) ? strtolower($parts['scheme']) : '';

        if (($scheme === 'http' || $scheme === 'https') && !empty($parts['host'])) {
                $layout = new layout($requested);
                $console = $layout->renderConsole();

                // getJSON_FILE() yields false when the file could not be read.
                $json_file = $layout->getJSON_FILE();
                if ($json_file === false) {
                        $json_file = "";
                }

                $toString = $layout->toString();

                // Only point the iframe at a saved copy that actually has a path;
                // backslashes from a Windows-style path are not valid in a URL.
                $savedHtml = $layout->get_SAVE_TO_HTML();
                if (is_string($savedHtml) && $savedHtml !== '') {
                        $getBody = "http://" . $_SERVER['SERVER_NAME']."/etramp/scripts/" . str_replace('\\', '/', $savedHtml);
                }
        } else {
                $console = "The submitted value is not a valid http(s) URL";
        }
}

?>

<html>
<head>
        <title>Vuly Layout Checker</title>
        <style type="text/css">
        html {
                height: 100%;
                margin:0;padding:0;
        }
        body {
                background: #728eaa;
                background: -moz-linear-gradient(top, #25303C 0%, #728EAA 100%);
                background: -webkit-gradient(linear, left top, left bottom, color-stop(0%, #25303C), color-stop(100%, #728EAA));
                font-family: sans-serif;
        }
        input, select {
                padding:10px;
        }
        select, input[type='submit'] {
                cursor:pointer;
        }
        label {
                color: #fff;
                padding-right: 10px;
        }
        form {
                margin: 50px auto 0 auto;
                width: 684px;
        }
        .text1 {
                width:49%; height:220px; resize: none; position:fixed; top:150px;
        }
        .text2 {
                width:49%; resize: none; position:fixed; top: 380px; bottom:10px; height: 58%;
        }
        </style>
        <script src="//ajax.googleapis.com/ajax/libs/jquery/1.10.1/jquery.min.js"></script>
        <script type="text/javascript">
        // Parsed DOM tree emitted by the server as a JSON literal.
        var file;
        $( document ).ready(function() {
        // Fall back to an empty array when no JSON was generated: an empty echo
        // would leave "file = ;", a syntax error that disables this whole script.
        file = <?php echo ($json_file != "") ? $json_file : "[]"; ?>;
                recurse(file);
        });
        /**
         * Walks the parsed DOM tree depth-first, logging every tag, and reports
         * each node whose tag is "body".
         *
         * @param {Array} file List of nodes, each with tag, innerHTML and childNodes.
         */
        function recurse(file) {
                if (!Array.isArray(file)) { return; }

                for (var i = 0; i < file.length; i++) {
                        var node = file[i];
                        // Logged inside the loop: before the loop there is no index
                        // yet, so reading file[i] there threw a TypeError.
                        console.log(node.tag);

                        if (!node.childNodes) { continue; }

                        if (node.tag == "body") {
                                console.log($(node.innerHTML, $('#NEW_LAYOUT').contents()));
                                alert(node.tag);
                        } else {
                                recurse(node.childNodes);
                        }
                }
        }
        </script>
</head>
<body>

        <?php // All values below come from the request or from the fetched remote page, so each one is HTML-escaped before output. ?>
        <textarea class="text1" style="left:10px;"><?php echo htmlspecialchars((string) $console, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?></textarea>
        <textarea class="text1" style="right:10px;"><?php echo htmlspecialchars((string) $json_file, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?></textarea>

        <form>
                <label for="layout">Website URL:</label>
                <input type="text" name="layout" id="layout" style="width: 500px" value="<?php echo htmlspecialchars((isset($_REQUEST['layout']) && is_string($_REQUEST['layout'])) ? $_REQUEST['layout'] : "http://", ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?>">
                <input type="submit">
        </form>

        <textarea class="text2" style="left:10px;"><?php echo htmlspecialchars((string) $toString, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?></textarea>
        <iframe id="NEW_LAYOUT" class="text2" style="right:10px;" src="<?php echo htmlspecialchars((string) $getBody, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?>"></iframe>

</body>
</html>

1 个答案:

答案 0 :(得分:0)

我推荐像phpQuery这样的东西 https://code.google.com/p/phpquery/

那么你要做的就是cURL那个URL然后将结果传递给phpQuery,如下所示:

phpQuery::selectDocument($doc);

下面的例子展示了如何遍历 HTML 元素(遍历最近一次选中的 DOM 中的所有 LI):

// Visit every node that pq('li') yields.
foreach(pq('li') as $li) {
        // iteration returns PLAIN dom nodes, NOT phpQuery objects
        $tagName = $li->tagName;
        $childNodes = $li->childNodes;
        // so you NEED to wrap it within phpQuery, using pq();
        pq($li)->addClass('my-second-new-class');
}