使用PHP DOMDocument根据HTML标题元素(h1–h6)生成多维数组

时间:2018-02-20 20:02:36

标签: php multidimensional-array domdocument php-7

首先,我确信自己几个小时前就该发现问题出在哪里,但我就是看不出来。

情况

所以,情况是这样的:我正在尝试编写一个可重用的(非递归)函数,把一段HTML解析成按标题元素(H1–H6)拆分的多维数组。基本上,最终结果的嵌套层级不应超过7层(H1至H6,再加上H6的子元素)。此外还有一个特殊处理:出现在第一个H1之前的元素会被归入一个名为“Top”的“特殊”部分。

代码

<?php
    /**
     * Group a flat list of elements into sections opened by <hN> headings.
     *
     * Every element whose tagName equals 'h' . $level starts a new entry of the
     * form ['title' => textContent, 'children' => [heading, ...followers]].
     * Elements that follow are appended to the most recent section. Elements
     * that appear while no section is open are stored as standalone entries,
     * each under its own integer key.
     *
     * @param mixed $section List of objects exposing tagName / textContent;
     *                       any non-array value is handed back unchanged.
     * @param int   $level   Heading level to split on. Only the integer 1
     *                       seeds the result with a 'Top' section at key 0.
     * @return mixed Integer-keyed list of sections / loose elements, or the
     *               original $section when it is not an array.
     */
    function sortEntrySections($section, $level = 1) {
        // Nothing to group unless we were given a list.
        if (!is_array($section)) {
            return $section;
        }

        $headingTag = 'h' . $level;

        $grouped = array();
        if ($level === 1) {
            // Catch-all bucket for content that precedes the first <h1>.
            $grouped[] = array('title' => 'Top', 'children' => array());
        }

        $cursor = 0;
        foreach ($section as $node) {
            $isHeading = ($node->tagName == $headingTag);
            $hasOpenSection = isset($grouped[$cursor]) && is_array($grouped[$cursor]);

            if (!$isHeading && $hasOpenSection) {
                $grouped[$cursor]['children'][] = $node;
                continue;
            }

            // Either a new heading, or a loose element with no section to join.
            $cursor++;
            if ($isHeading) {
                $grouped[$cursor] = array('title' => $node->textContent, 'children' => array($node));
            } else {
                $grouped[$cursor] = $node;
            }
        }

        return $grouped;
    }

    /**
     * Parse the global library entry body and try to nest it by heading level.
     *
     * Reads HTML from $GLOBALS['libraryEntry']['body'], collects the element
     * children of the parsed <body>, splits them on <h1> via
     * sortEntrySections() and then attempts to subdivide each <h1> section by
     * <h2>..<h6> with hand-written nested loops.
     *
     * @return array The level-1 sections, each with its 'children' replaced by
     *               the result of the level-2 pass.
     */
    function breakupEntry() {
        $body = new DOMDocument();
        // Entity-decode the stored body, re-encode it as HTML entities and parse it;
        // the leading @ silences any diagnostics raised while loading.
        @$body->loadHTML(mb_convert_encoding(html_entity_decode($GLOBALS['libraryEntry']['body']), 'HTML-ENTITIES', 'UTF-8'));
        // Second document used only as the owner of the imported node copies.
        $formattedBody = new DOMDocument();

        /* Build Multidimensional Array of Sections */
        // This initial value is overwritten by the for loop below.
        $i = 0;
        $elements = array();
        foreach($body->getElementsByTagName('*') as $child) {
            // Keep only elements whose parent is <body>. The html/body checks come
            // first so parentNode->tagName is never read for those two elements.
            if($child->tagName !== 'html' && $child->tagName !== 'body' && $child->parentNode->tagName === 'body') {
                // Deep-copy the element (and its subtree) into $formattedBody.
                array_push($elements, $formattedBody->importNode($child, true));
            }
        }
        $sections = sortEntrySections($elements, 1);
        // Starts at 1, so entry 0 (the 'Top' section that sortEntrySections()
        // creates at level 1) is never subdivided.
        for($i = 1; $i < sizeof($sections); $i++) {
            $childrenH1 = sortEntrySections($sections[$i]['children'], 2);
            // NOTE(review): sortEntrySections() fills its result by integer key, so
            // the returned list itself does not appear to carry a 'children' key.
            // If so, this check (and the equivalent ones nested below) never passes
            // and nothing deeper than the <h2> split is produced - confirm.
            if(isset($childrenH1['children'])) {
                foreach($childrenH1['children'] as $j => $childH1) {
                    // NOTE(review): passes a single entry rather than an entry's
                    // 'children' list; sortEntrySections() returns non-arrays as-is.
                    $childrenH2 = sortEntrySections($childH1, 3);
                    if(isset($childrenH2['children'])) {
                        foreach($childrenH2['children'] as $k => $childH2) {
                            $childrenH3 = sortEntrySections($childH2, 4);
                            if(isset($childrenH3['children'])) {
                                foreach($childrenH3['children'] as $l => $childH3) {
                                    $childrenH4 = sortEntrySections($childH3, 5);
                                    if(isset($childrenH4['children'])) {
                                        foreach($childrenH4['children'] as $m => $childH4) {
                                            // Innermost pass: split on <h6>.
                                            $childrenH4[$m]['children'] = sortEntrySections($childH4, 6);
                                        }
                                    }
                                    $childrenH3['children'][$l] = $childrenH4;
                                }
                            }
                            $childrenH2['children'][$k] = $childrenH3;
                        }
                    }
                    $childrenH1['children'][$j] = $childrenH2;
                }
            }
            // Replace the flat child list with the level-2 result.
            $sections[$i]['children'] = $childrenH1;
        }
        return $sections;
    }

    // Sample input: three paragraphs ahead of the first <h1>, one <h1> whose
    // subsections nest all the way down to <h6>, then two flat <h1> sections.
    $body = <<<EOD
<p>Pre Header Section Content 1</p>
<p>Pre Header Section Content 2</p>
<p>Pre Header Section Content 3</p>
<h1>Header 1</h1>
<p>Header 1 Section Content 1</p>
<p>Header 1 Section Content 2</p>
<p>Header 1 Section Content 3</p>
<h2>Header 1.1</h2>
<p>Header 1 Subheader 1 Section Content 1</p>
<p>Header 1 Subheader 1 Section Content 2</p>
<p>Header 1 Subheader 1 Section Content 3</p>
<h3>Header 1.1.1</h3>
<p>Header 1 Subheader 1 Subheader 1 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Section Content 3</p>
<h4>Header 1.1.1.1</h4>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Section Content 3</p>
<h5>Header 1.1.1.1.1</h5>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 3</p>
<h6>Header 1.1.1.1.1.1</h6>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 3</p>
<h6>Header 1.1.1.1.1.2</h6>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 3</p>
<h6>Header 1.1.1.1.1.3</h6>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 3</p>
<h5>Header 1.1.1.1.2</h5>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 3</p>
<h5>Header 1.1.1.1.3</h5>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 3</p>
<h4>Header 1.1.1.2</h4>
<p>Header 1 Subheader 1 Subheader 1 Subheader 2 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 2 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 2 Section Content 3</p>
<h4>Header 1.1.1.3</h4>
<p>Header 1 Subheader 1 Subheader 1 Subheader 3 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 3 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 3 Section Content 3</p>
<h3>Header 1.1.2</h3>
<p>Header 1 Subheader 1 Subheader 2 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 2 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 2 Section Content 3</p>
<h3>Header 1.1.3</h3>
<p>Header 1 Subheader 1 Subheader 3 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 3 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 3 Section Content 3</p>
<h2>Header 1.2</h2>
<p>Header 1 Subheader 2 Section Content 1</p>
<p>Header 1 Subheader 2 Section Content 2</p>
<p>Header 1 Subheader 2 Section Content 3</p>
<h2>Header 1.3</h2>
<p>Header 1 Subheader 3 Section Content 1</p>
<p>Header 1 Subheader 3 Section Content 2</p>
<p>Header 1 Subheader 3 Section Content 3</p>
<h1>Header 2</h1>
<p>Header 2 Section Content 1</p>
<p>Header 2 Section Content 2</p>
<p>Header 2 Section Content 3</p>
<h1>Header 3</h1>
<p>Header 3 Section Content 1</p>
<p>Header 3 Section Content 2</p>
<p>Header 3 Section Content 3</p>
EOD;
    // breakupEntry() takes no arguments; it reads $GLOBALS['libraryEntry']['body'].
    $libraryEntry = array('body' => $body);

    $results = breakupEntry();

    // Dump the structure inside a <textarea> so the browser shows it verbatim.
    echo '<textarea>'; var_dump($results); echo '</textarea>';
?>

结果

https://pastebin.com/JLftvXdB

预期

https://pastebin.com/tzqxu8q4

1 个答案:

答案 0 :(得分:1)

我把这段代码重写了大约六次,每次都会遇到一个让我卡住的新问题。最后,我把它改写成一个有深度限制的递归函数,通过限制$level变量的取值范围(1到6)来确保递归不会超出预期的层级。

<?php
    /**
     * Recursively nest a flat list of elements by heading level.
     *
     * At the given level, each element whose tagName equals 'h' . $level opens
     * a section ['title' => textContent, 'children' => [heading, ...followers]];
     * later elements join the most recent section, and elements seen while no
     * section is open become standalone integer-keyed entries. Each section's
     * 'children' list is then split again one level deeper, stopping after
     * level 6.
     *
     * @param mixed $section List of objects exposing tagName / textContent;
     *                       any non-array value is returned unchanged.
     * @param int   $level   Heading level to split on; converted with intval()
     *                       and clamped to 1..6. Level 1 seeds a 'Top' section
     *                       at key 0.
     * @return mixed Nested, integer-keyed section structure, or the original
     *               $section when it is not an array.
     */
    function sortEntrySections($section, $level = 1) {
        if (!is_array($section)) {
            return $section;
        }

        // Keep the heading level inside h1..h6 so the recursion is bounded.
        $level = max(1, min(6, intval($level)));
        $headingTag = 'h' . $level;

        $sectionStructure = array();
        if ($level === 1) {
            // Bucket for anything that appears before the first <h1>.
            $sectionStructure[0] = array('title' => 'Top', 'children' => array());
        }

        $current = 0;
        foreach ($section as $element) {
            $startsSection = ($element->tagName == $headingTag);
            $canAppend = isset($sectionStructure[$current]) && is_array($sectionStructure[$current]);

            if (!$startsSection && $canAppend) {
                $sectionStructure[$current]['children'][] = $element;
            } elseif ($startsSection) {
                $current++;
                $sectionStructure[$current] = array('title' => $element->textContent, 'children' => array($element));
            } else {
                // Loose element with no open section: keep it as its own entry.
                $current++;
                $sectionStructure[$current] = $element;
            }
        }

        // Below h6 there is nothing left to split on.
        if ($level < 6) {
            foreach ($sectionStructure as $key => $entry) {
                if (is_array($entry) && isset($entry['children'])) {
                    $sectionStructure[$key]['children'] = sortEntrySections($entry['children'], $level + 1);
                }
            }
        }

        return $sectionStructure;
    }

    /**
     * Parse the global library entry body into nested heading sections.
     *
     * Reads HTML from $GLOBALS['libraryEntry']['body'], entity-decodes it,
     * parses it, collects deep copies of the element children of <body> and
     * passes them to sortEntrySections().
     *
     * A missing or empty body is treated as "no elements" instead of being
     * handed to DOMDocument::loadHTML(), which rejects an empty source.
     *
     * @return array Section structure as returned by sortEntrySections().
     */
    function breakupEntry() {
        $html = html_entity_decode((string) ($GLOBALS['libraryEntry']['body'] ?? ''));

        $elements = array();
        if($html !== '') {
            // Turn every non-ASCII character into a numeric entity so the parser
            // does not have to guess the input encoding.
            $html = mb_encode_numericentity($html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

            // Collect parser complaints about the HTML fragment internally rather
            // than emitting warnings, then restore the caller's libxml setting.
            $previousSetting = libxml_use_internal_errors(true);
            $body = new DOMDocument();
            $body->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);

            // Second document used only as the owner of the imported node copies.
            $formattedBody = new DOMDocument();

            /* Build Multidimensional Array of Sections */
            foreach($body->getElementsByTagName('*') as $child) {
                // Keep only elements whose parent is <body>. The html/body checks
                // come first so parentNode->tagName is never read for those two.
                if($child->tagName !== 'html' && $child->tagName !== 'body' && $child->parentNode->tagName === 'body') {
                    $elements[] = $formattedBody->importNode($child, true);
                }
            }
        }

        return sortEntrySections($elements);
    }

    // Sample input: three paragraphs ahead of the first <h1>, one <h1> whose
    // subsections nest all the way down to <h6>, then two flat <h1> sections.
    $body = <<<EOD
<p>Pre Header Section Content 1</p>
<p>Pre Header Section Content 2</p>
<p>Pre Header Section Content 3</p>
<h1>Header 1</h1>
<p>Header 1 Section Content 1</p>
<p>Header 1 Section Content 2</p>
<p>Header 1 Section Content 3</p>
<h2>Header 1.1</h2>
<p>Header 1 Subheader 1 Section Content 1</p>
<p>Header 1 Subheader 1 Section Content 2</p>
<p>Header 1 Subheader 1 Section Content 3</p>
<h3>Header 1.1.1</h3>
<p>Header 1 Subheader 1 Subheader 1 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Section Content 3</p>
<h4>Header 1.1.1.1</h4>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Section Content 3</p>
<h5>Header 1.1.1.1.1</h5>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 3</p>
<h6>Header 1.1.1.1.1.1</h6>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Section Content 3</p>
<h6>Header 1.1.1.1.1.2</h6>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 3</p>
<h6>Header 1.1.1.1.1.3</h6>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 3</p>
<h5>Header 1.1.1.1.2</h5>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 2 Section Content 3</p>
<h5>Header 1.1.1.1.3</h5>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 1 Subheader 3 Section Content 3</p>
<h4>Header 1.1.1.2</h4>
<p>Header 1 Subheader 1 Subheader 1 Subheader 2 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 2 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 2 Section Content 3</p>
<h4>Header 1.1.1.3</h4>
<p>Header 1 Subheader 1 Subheader 1 Subheader 3 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 3 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 1 Subheader 3 Section Content 3</p>
<h3>Header 1.1.2</h3>
<p>Header 1 Subheader 1 Subheader 2 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 2 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 2 Section Content 3</p>
<h3>Header 1.1.3</h3>
<p>Header 1 Subheader 1 Subheader 3 Section Content 1</p>
<p>Header 1 Subheader 1 Subheader 3 Section Content 2</p>
<p>Header 1 Subheader 1 Subheader 3 Section Content 3</p>
<h2>Header 1.2</h2>
<p>Header 1 Subheader 2 Section Content 1</p>
<p>Header 1 Subheader 2 Section Content 2</p>
<p>Header 1 Subheader 2 Section Content 3</p>
<h2>Header 1.3</h2>
<p>Header 1 Subheader 3 Section Content 1</p>
<p>Header 1 Subheader 3 Section Content 2</p>
<p>Header 1 Subheader 3 Section Content 3</p>
<h1>Header 2</h1>
<p>Header 2 Section Content 1</p>
<p>Header 2 Section Content 2</p>
<p>Header 2 Section Content 3</p>
<h1>Header 3</h1>
<p>Header 3 Section Content 1</p>
<p>Header 3 Section Content 2</p>
<p>Header 3 Section Content 3</p>
EOD;
    // breakupEntry() takes no arguments; it reads $GLOBALS['libraryEntry']['body'].
    $libraryEntry = array('body' => $body);

    $results = breakupEntry();

    // Dump the structure inside a <textarea> so the browser shows it verbatim.
    echo '<textarea>'; var_dump($results); echo '</textarea>';
?>