我有一个小项目,我必须保存整个网站的DOM,然后做一些操作,比如获取css计算风格等。(这将通过jquery完成)
目前我已经编写了几个函数来通过cURL获取HTML标记。有没有办法将这个DOM保存到多维数组中?递归?
目前即时使用此https://code.google.com/p/php-html2array/(我正在使用版本1.01),它工作正常,只是由于某种原因它缺少一些元素,如主'body'标签..和其他关键元素。网址测试是:http://www.vulytrampolines.com/
有人可以告诉我如何做到这一点,或者我如何使用Google代码来编辑php以查看正文标记?
到目前为止我的代码(Pastebin):
<?php
/**
* Website Layout Checker
*
* @package
* @author Marais Rossouw (marais.r@vulytrampolines.com)
* @copyright Vuly
* @version 2013
* @access public
*/
require_once '../setup.php';
ini_set('max_execution_time', 6000);
class layout {
private $_LAYOUT, $_URL, $_DOC, $_LAYOUT_ARRAY, $_SAVE_TO_JSON, $_SAVE_TO_HTML, $_HTML_BODY;
private $_CONSOLE = array();
public function __construct($url) {
// Get's the contents of the page specified.
try {
$client = new Zend_Http_Client;
$client->setUri($url);
$client->setConfig(array('strictredirects' => true, 'maxredirects' => 10, 'timeout' => 8));
$response = $client->request();
$this->_LAYOUT = $response->getBody();
$this->_URL = $url;
} catch (Exception $e) {
$this->consoleLog($e);
}
// Creates a DOMDocument
try {
$this->_INIT();
} catch (Exception $e) {
$this->consoleLog($e);
}
// Save the files
try {
file_put_contents($this->_SAVE_TO_JSON, json_encode($this->_LAYOUT_ARRAY));
file_put_contents($this->_SAVE_TO_HTML, $this->_LAYOUT);
$this->consoleLog("The JSON file was saved to: " . $this->_SAVE_TO_JSON);
$this->consoleLog("The HTML file was saved to: " . $this->_SAVE_TO_HTML);
} catch (Exception $e) {
$this->consoleLog($e);
}
}
private function _INIT() {
$doc = new DOMDocument();
libxml_use_internal_errors(true);
$doc->loadHTML($this->_LAYOUT);
$this->_DOC = new DOMXpath($doc);
$this->consoleLog("DOMDocument created");
$parser = new htmlParser($this->_LAYOUT);
$this->_LAYOUT_ARRAY = $parser->toArray();
//var_dump($this->_LAYOUT_ARRAY[0]['childNodes']);exit;
$this->consoleLog("Dom array created");
$this->consoleLog("There are " . count($this->_LAYOUT_ARRAY, COUNT_RECURSIVE) . " elements in the dom array");
$this->_FILE_NAME = "VULY_LAYOUT_CHECKER-" . sha1(htmlspecialchars(trim($this->_URL)) . date("Ymd") . rand(99, 9999));
$this->_FILE_PATH = "layout_checker\\"/*sys_get_temp_dir() . "\\"*/;
$this->_SAVE_TO_JSON = $this->_FILE_PATH . $this->_FILE_NAME . ".txt";
$this->_SAVE_TO_HTML = $this->_FILE_PATH . $this->_FILE_NAME . ".html";
libxml_use_internal_errors(false);
}
public function toString() {
return $this->_LAYOUT;
}
public function getBody() {
$this->recurse($this->_LAYOUT_ARRAY);
return $this->_HTML_BODY;
}
private function recurse($file) {
if ($this->_HTML_BODY != false) { return; }
for ($i = 0; $i < count($file); $i++) {
if ($file[$i]['childNodes']) {
if ($file[$i]['tag'] == "body") {
$this->_HTML_BODY = $file[$i]['innerHTML'];
return;
} else {
$this->recurse($file[$i]['childNodes']);
}
}
}
}
public function getJSON_FILE() {
return file_get_contents($this->_SAVE_TO_JSON);
}
public function get_SAVE_TO_HTML() {
return $this->_SAVE_TO_HTML;
}
public function consoleLog($string) {
$this->_CONSOLE[] = $string;
}
public function renderConsole() {
$return = "";
$_PAD_SIZE = strlen(count($this->_CONSOLE)) + 2;
foreach ($this->_CONSOLE as $key => $value) {
$return .= str_pad($key . ":", $_PAD_SIZE) . $value . "\n";
}
return $return;
}
}
class htmlParser {
//your very own separator
//do not enter characters such as < or >
private $separator = '~';
//the tags that don't have any innerHTML in them
//feel free to add some if I missed any
private $singleTags = 'meta|img|hr|br|link|!--|!DOCTYPE|input';
//-- Don't edit below this --
private $html,$level;
public $levelArray;
function __construct($html='') {
$this->html=$this->removeWhiteSpace($html);
$this->level=-1;
$this->levelArray=array();
}
function __destruct() {
//nothing yet;
}
private function getElement($value) {
$ar = explode($this->separator,$value);
$ar = explode('-',$ar[1]);
return $this->levelArray[$ar[0]][$ar[1]];
}
private function parseToHTML($str,$level) {
$ar=$this->getArrayOfReplacements($str);
foreach ($ar as $item) {
$elem = $this->getElement($item);
$str=str_replace($item,($level==0?$elem['htmlText']:'<'.$elem['tag'].$elem['attr'].'>'.$elem['htmlText'].'</'.$elem['tag'].'>'),$str);
}
return $str;
}
private function replaceSingleTags() {
//tags like img, input etc
$result=preg_match_all('/<('.$this->singleTags.')(.[^><]*)?>/is', $this->html, $m);
if ($result>0) {
foreach ($m[0] as $id => $value) {
$this->html = str_replace($value,'',$this->html);
}
}
}
private function replaceSimpleTags() {
//tags that only have text in them (no other content)
$result=preg_match_all('/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is', $this->html, $m);
if ($result>0) {
$this->level++;
$oneLevel=array();
foreach ($m[0] as $id => $value) {
if ($this->level==0) $htmlText=$value;
else $htmlText=$this->parseToHTML($m[3][$id],$this->level-1);
$oneLevel []= array('str' => $value, 'rep' => $this->separator.$this->level.'-'.$id.$this->separator, 'tag' => $m[1][$id], 'level' => $this->level, 'text' => $m[3][$id], 'attr' => $m[2][$id] , 'htmlText' => $htmlText);
$this->html = str_replace($value,$this->separator.$this->level.'-'.$id.$this->separator,$this->html);
}
$this->levelArray [$this->level] = $oneLevel;
}
}
private function replaceRemainingTags() {
//tags that remain after everything
$result=preg_match_all('/<(.[^\s]*)(.[^><]*)?>(.*)?<\/\1>/is', $this->html, $m);
if ($result>0) {
$this->level++;
$oneLevel=array();
foreach ($m[0] as $id => $value) {
if ($this->level==0) $htmlText=$m[3][$id];
else $htmlText=$this->parseToHTML($m[3][$id],$this->level-1);
$oneLevel []= array('str' => $value, 'rep' => $this->separator.$this->level.'-'.$id.$this->separator, 'tag' => $m[1][$id], 'level' => $this->level, 'text' => $m[3][$id], 'attr' => $m[2][$id] , 'htmlText' => $htmlText);
$this->html = str_replace($value,$this->separator.$this->level.'-'.$id.$this->separator,$this->html);
}
$this->levelArray [$this->level] = $oneLevel;
}
}
private function existSimpleTags() {
$result=preg_match('/<(.[^\s]*)(.[^><]*)?>(.[^<]*)?<\/\1>/is', $this->html);
return $result>0;
}
private function existSingleTags() {
$result=preg_match('/<('.$this->singleTags.')(.[^><]*)?>/is', $this->html);
return $result>0;
}
private function removeWhiteSpace ($string) {
$string = str_replace(array("\n","\r",' ',"\t"),'',$string);
return preg_replace('| +|', ' ', $string);
}
public function toArray($html='') {
//first part: coding
if ($html!='') {
$this->html = $this->removeWhiteSpace($html);
}
while ($this->existSimpleTags() || $this->existSingleTags()) {
$this->replaceSingleTags();
$this->replaceSimpleTags();
}
$this->replaceRemainingTags();
//now decoding
$ar=$this->getArray($this->html);
return $ar;
}
private function getArrayOfReplacements($str) {
$final=array();
$ar=explode($this->separator,$str);
for ($i=0;$i<(count($ar)-1)/2;$i++) {
$final []= $this->separator.$ar[$i*2+1].$this->separator;
}
return $final;
}
private function startsWithText($str) {
$first=substr(trim(str_replace(array("\n","\r"),'',$str)),0,1);
if ($first=='<' || $first=='>') return false;
return true;
}
private function strInArray($array,$str) {
foreach ($array as $item) {
if (strpos($str,$item)!==false)
return true;
}
return false;
}
private function getArray($html, $father='') {
$final=array();
if (strpos($html,$this->separator)!==false) {
$r=$this->getArrayOfReplacements($html);
foreach ($r as $i) {
$ar = explode($this->separator,$i);
$ar = explode('-',$ar[1]);
$elem = $this->levelArray[$ar[0]][$ar[1]];
$this->levelArray[$ar[0]][$ar[1]]['father'] = $father;
$final []= array( 'tag' => $elem['tag'], 'innerHTML' => $elem['htmlText'], 'repl' => $elem['rep'],'stratr' => $elem['attr'], 'level' => $elem['level'], 'father' => $father, 'childNodes' => $this->getArray($elem['text'],$i));
}
}
return $final;
}
public function loadNode($rep) {
$elem = $this->getElement($rep);
return array( 'tag' => $elem['tag'], 'innerHTML' => $elem['htmlText'], 'repl' => $elem['rep'],'stratr' => $elem['attr'], 'level' => $elem['level'], 'father' => $elem['father']);
}
}
if (isset($_REQUEST['layout'])) {
$layout = new layout($_REQUEST['layout']);
$console = $layout->renderConsole();
$json_file = $layout->getJSON_FILE();
$toString = $layout->toString();
$getBody = "http://" . $_SERVER['SERVER_NAME']."/etramp/scripts/" . $layout->get_SAVE_TO_HTML();
} else {
$console = "";
$json_file = "";
$toString = "";
$getBody = "";
}
?>
<html>
<head>
<title>Vuly Layout Checker</title>
<style type="text/css">
html {
height: 100%;
margin:0;padding:0;
}
body {
background: #728eaa;
background: -moz-linear-gradient(top, #25303C 0%, #728EAA 100%);
background: -webkit-gradient(linear, left top, left bottom, color-stop(0%, #25303C), color-stop(100%, #728EAA));
font-family: sans-serif;
}
input, select {
padding:10px;
}
select, input[type='submit'] {
cursor:pointer;
}
label {
color: #fff;
padding-right: 10px;
}
form {
margin: 50px auto 0 auto;
width: 684px;
}
.text1 {
width:49%; height:220px; resize: none; position:fixed; top:150px;
}
.text2 {
width:49%; resize: none; position:fixed; top: 380px; bottom:10px; height: 58%;
}
</style>
<script src="//ajax.googleapis.com/ajax/libs/jquery/1.10.1/jquery.min.js"></script>
<script type="text/javascript">
var file;
$( document ).ready(function() {
file = <?php echo $json_file; ?>;
recurse(file);
});
function recurse(file) {
console.log(file[i].tag);
for (var i = 0; i < file.length; i++) {
if (file[i].childNodes) {
if (file[i].tag == "body") {
console.log($(file[i].innerHeml, $('#NEW_LAYOUT').contents()));
alert(file[i].tag);
} else {
recurse(file[i].childNodes);
}
}
}
}
</script>
</head>
<body>
<textarea class="text1" style="left:10px;"><?php echo $console; ?></textarea>
<textarea class="text1" style="right:10px;"><?php echo $json_file; ?></textarea>
<form>
<label for="layout">Website URL:</label>
<input type="text" name="layout" id="layout" style="width: 500px" value="<?php echo (isset($_REQUEST['layout'])) ? $_REQUEST['layout'] : "http://"; ?>">
<input type="submit">
</form>
<textarea class="text2" style="left:10px;"><?php echo $toString; ?></textarea>
<iframe id="NEW_LAYOUT" class="text2" style="right:10px;" src="<?php echo $getBody; ?>"></iframe>
</body>
</html>
答案 0 :(得分:0)
我推荐像phpQuery这样的东西 https://code.google.com/p/phpquery/
那么你要做的就是cURL那个URL然后将结果传递给phpQuery,如下所示:
phpQuery::selectDocument($doc);
这里的例子是如何迭代HTML //来自最后选择的DOM的所有LI
foreach(pq('li') as $li) {
// iteration returns PLAIN dom nodes, NOT phpQuery objects
$tagName = $li->tagName;
$childNodes = $li->childNodes;
// so you NEED to wrap it within phpQuery, using pq();
pq($li)->addClass('my-second-new-class');
}