DOM HTML清洁问题

时间:2014-08-20 08:15:37

标签: php html dom tags strip

我一直在尝试输入不整洁的HTML并通过删除不需要的标签和属性来清理它。我只输出空白HTML作为输出,没有任何行。在下面发布我的代码..请帮帮我。

的index.php

<?php
    //for converting tga to png or jpg
    ini_set("memory_limit", "150M"); 
?>
<? ob_start(); ?>
<?php

    /*
        @name: Raja Gopal
        @author: Raja Gopal
        @discribe: Set Infotech Tool
        @version: 1.2 Beta
    */

    //file downloading after saving in "recodeit" folder
    function download($file) {
        if(file_exists($file)) {
            if(ob_get_level()) {
              ob_end_clean();
            }

            header('Content-Description: File Transfer');
            header('Content-Type: application/octet-stream');
            header('Content-Disposition: attachment; filename=' . basename($file));
            header('Content-Transfer-Encoding: binary');
            header('Expires: 0');
            header('Cache-Control: must-revalidate');
            header('Pragma: public');
            header('Content-Length: ' . filesize($file));

            readfile($file); 

        }
    }

    /*
        MAIN
    */  

    if(count($_POST) && isset($_POST['submit'])) {  

        if(!empty($_POST['type'])) {

            //upload file to the server
            if(!empty($_FILES['files']['tmp_name'])) {
                $dir = 'Original/';
                $upload_dir = $dir . basename($_FILES['files']['name']);
                $upload_dir = str_replace(" ", "", $upload_dir);
                if(copy($_FILES['files']['tmp_name'], $upload_dir)) {
                    $flag = true;
                } else {
                    $flag = false;
                }
            } 

            //get file format
            $str[] = $_FILES['files']['name'];
            $str = htmlspecialchars(implode('', $str));
            $str_arr = explode('.', $str);
            $result[] = $str_arr[count($str_arr)-1];
            $result = implode('', $result);
            $result = strtolower($result);
            $result = str_replace(" ", "", $result);
            //$result = trim($result);

            //get file name with dots
            $st[] = $_FILES['files']['name'];
            $st = htmlspecialchars(implode('', $st));
            $str_arr = explode('.', $st);
            $i = 0;
            $flname = '';
            while($i < count($str_arr)-1) {
                if($i == count($str_arr)-2) {
                    $flname .= $str_arr[$i];
                } else {
                    $flname .= $str_arr[$i].'.';
                }
                $i++;
            }

            $flname = str_replace(" ", "", $flname);
            //array with extensions
            $arr = array();
            $arr['html'] = array('html');


            //format check
            if($flag && array_key_exists($result, $arr)) {
                $from = trim($result);
                $to = $_POST['type'];
                $format_res = $flname . '.' . $from;
                $class = $from . $to;

                //add classes from "classes" folder 
                if(require_once('classes/' . $class . '.php')) {
                    $fileclass = new $class($format_res, $flname); 


                    //use our function to save the file
                    download('Recode/' . $flname . '.' . $to);

                    //delete Original folder
                    if(file_exists('Original/' . $format_res)) 
                        unlink('Original/' . $format_res);

                    //clear Recode folder
                    if(file_exists('Recode/'.$flname . '.' . $to)) 
                        unlink('Recode/'.$flname . '.' . $to);
                    exit();
                }   
            }
        }
    }

?>
<? ob_start(); ?>
<!DOCTYPE html>
<html>
<head>
    <title>HTML Cleaning Tool : Set Infotech Pvt Ltd</title>
    <link href="style/style.css" rel="stylesheet">
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <!--works faster becouse file is cached-->
    <script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js"></script>
    <script type="text/javascript" src="js/jquery-1.8.1.js"></script>
    <script type="text/javascript">
        //extensions select
        $('.activated').live('click', function() {
            $('.type_selected').removeClass('type_selected');
            $('.input_type[value="' + $(this).val() + '"]').prop('checked',true);
            $(this).addClass('type_selected');
        });

    </script>
    <script type="text/javascript">

        $(document).ready(function() {
            var filename;
            var extensions = [];
            var extension;
            //array with our extensions
            extensions['html'] = ['html'];



            $('input[type="file"]').change(function(e) {
                // Deselect the error message \ successful recoding
                $('.type_selected').removeClass('type_selected');
                $("#message").removeClass("visible").addClass("hidden");
                //get file name and extention
                var filepath = e.target.value.split('\\');
                filename = filepath[filepath.length-1].split('.');
                extension = filename[filename.length-1];
                $('.file_type').not('.deactivated').removeClass('activated').addClass('deactivated');
                //show possible extensions
                if(extensions[extension.toLowerCase()] !== undefined) {
                    $.each(extensions[extension.toLowerCase()], function(k,v) {
                        $('.' + v).removeClass('deactivated').addClass('activated');
                    });
                }
                //show tick 
                $("#validation").css({
                    "background": "url('img/true.png') no-repeat"
                });
                //show cross
                if(filename.length == 1) {
                    $("#validation").css({
                        "background-image": "url('img/false.png')"
                    });
                }
            });
            //submit event
            $('.submit').click(function() {
                var text;
                //message about the wrong extension
                if(filename === undefined) {
                    $("#message").removeClass("hidden").addClass("visible");
                    $("#message").css({
                        "border": "2px solid #9c3232",
                        "background-color": "#d59e9e"
                    });
                    text = "<center>Your book is not loaded! Please select a book to start conversion!</center>";
                    $("#message").html(text);
                    $('.file_type').not('.deactivated').removeClass('activated').addClass('deactivated');
                    return false;
                }
                //message that there is no extension 
                if($('input[type="radio"]:checked').length==0) {
                    $("#message").removeClass("hidden").addClass("visible");
                    $("#message").css({
                        "border": "2px solid #9c3232",
                        "background-color": "#d59e9e"
                    });
                    text = "<center>You have to select the extension!</center>";
                    $("#message").html(text);
                    return false;   
                }
                //message about successful conversion
                if(filename !== undefined && $('input[type="radio"]:checked').length>0) {
                    $("#message").css({
                        "border": "2px solid #2e8856",
                        "background-color": "#5abd68"
                    })
                    $("#message").removeClass("hidden").addClass("visible");
                    text = "<center>Success ! Pleat Wait</center>";
                    $("#message").html(text);
                    $('.type_selected').removeClass('type_selected');
                    $.each(extensions[extension.toLowerCase()], function(k,v) {
                        $('.' + v).removeClass('activated').addClass('deactivated');
                    });
                    $("#validation").css({
                        "background-image": "url('img/false.png')"
                    });

                    setTimeout(function(){
                        $('input[type="file"]').val('');
                    }, 3000);

                    return true;
                }
            });

        }); 
    </script>
</head>
<body>
    <div class="hole_wrap">
        <!--WRAP CONTENT-->
        <div id="content_wrap">

            <!--BEGIN CONTENT-->
            <div id="content">

                <!--HEADER-->
                <div id="header">
                        <div id="block" class="f_l">
                            <div id="head_text"><h1>HTML Cleaning Tool</h1></div>
                            <div id="text"><p>Set Infotech Pvt Ltd</p></div>
                        </div>
                    <div id="logo" class="f_r"></div>
                    <div class="clearfix"></div>
                </div>
                <!--END HEADER-->
                <!--FORM BEGIN-->   
                <form class="form" name="f" method="POST" enctype="multipart/form-data" target="_blank">
                    <div id="loading_block" class="f_l">

                        <div id="head_text"><h1><center>1. Load Untidy HTML to Clean:</center></h1></div>
                        <!-- file load -->
                        <div class="upload_b f_l">
                            <input id="loading_f" class="files_load" type="file" name="files" size="10">
                        </div>
                        <div id="validation" class="f_r"></div>
                        <div class="clearfix"></div>
                        <div class="gr_line"></div>
                        <div id="head_text"><h1><center>2. Select HTML Below:</center></h1></div>
                        <!-- radios -->
                        <div class="formats f_l">
                            <center><input class="input_type" type="radio" name="type" value="html">
                        <!-- buttons -->
                            <button class="file_type html deactivated" type="button" value="html"></button></center>

                        </div>
                        <div class="clearfix"></div>
                        <!-- exec -->
                        <div class="gr_line"></div>
                        <div id="head_text"><h1><center>3. Clean HTML !</center></h1></div>
                        <input class="submit f_l" id="loader" type="submit" name="submit" value="Convert">
                        <div id="message" class="hidden f_l"><center></center></div>
                    </div>
                    <div class="clearfix"></div>
                </form>
                <!--END OF FORM-->
            </div>
            <!--END OF CONTENT-->
        </div>
        <!--END OF CONTENT WRAP-->
    </div>
</body>
</html>

htmlhtml.php

<?php
class htmlhtml
{
    /** @var string */
    private $tag;
    /** @var string */
    private $attribute;



    private $dom;

    public function __construct($format_res, $flname)
    {

        // Turn up error reporting
        error_reporting(E_ALL | E_STRICT);


        // Upload template
        $this->data = file_get_contents('Original/' . $format_res);


        $this->dom = new DOMDocument();
        $this->dom->strictErrorChecking = false;
        $this->dom->formatOutput = true;
        $this->dom->loadHTML(base64_decode($this->data));

        $exceptions = array(
            'a'   => array('href'),
            'img' => array('src')
        );

        $this->stripAttributes($exceptions);
        $this->stripSpanTags();

        $decoded = base64_decode($this->data);
        $decoded = $this->stripNonBreakingSpaces($decoded);

        file_put_contents('Recode/' . $flname . '.html', $decoded);
    }

    public function stripAttributes(array $exceptions)
    {
        $xpath = new DOMXPath($this->dom);
        if (false === ($elements = $xpath->query("//*"))) die('Xpath error!');

        /** @var $element DOMElement */
        foreach ($elements as $element) {
            for ($i = $element->attributes->length; --$i >= 0;) {
                $this->tag       = $element->nodeName;
                $this->attribute = $element->attributes->item($i)->nodeName;

                if ($this->checkAttrExceptions($exceptions)) continue;

                $element->removeAttribute($this->attribute);
            }
        }

        $this->data = base64_encode($this->dom->saveHTML());
    }

    public function checkAttrExceptions(array $exceptions)
    {
        foreach ($exceptions as $tag => $attributes) {
            if (empty($attributes) || !is_array($attributes)) {
                die('Attributes not set!');
            }

            foreach ($attributes as $attribute) {
                if ($tag === $this->tag && $attribute === $this->attribute) {
                    return true;
                }
            }
        }

        return false;
    }


    /**
     * Strip SPAN tags from current DOM document
     *
     * @return void
     */
    /**
     * Strip SPAN tags from current DOM document
     *
     * @return void
     */
    protected function stripSpanTags ()
    {
        $nodes = $this->dom->getElementsByTagName('span');

        while ($span = $nodes->item(0)) {
            $replacement = $this->dom->createDocumentFragment();
            while ($inner = $span->childNodes->item(0)) {
                $replacement->appendChild($inner);
            }
            $span->parentNode->replaceChild($replacement, $span);
        }

        $this->data = base64_encode($this->dom->saveHTML());
    }


    /**
     * Replace all &nbsp; entities within a string with a regular space
     *
     * @param string $string Input string
     *
     * @return string
     */
    protected function stripNonBreakingSpaces ($string)
    {
        return str_replace('&nbsp;', ' ', $string);
    }


}

0 个答案:

没有答案