Question

功能实现不正确：脚本本应得到一段干净的文本，但在这种情况下，它从网站上抓取到的是少量正文和大量不必要的信息。

index 脚本：

<?php

include("strip_tags_smart.php");
include("UTF8.php");
include("ReflectionTypehint.php");

/* Read an HTML file */
/*
 * Handle the submitted URL list.
 *
 * For every non-empty line of $_POST['urls'] the page is downloaded with
 * file_get_utf8_contents(), the content of its <title> element is extracted,
 * the markup is removed with strip_tags_smart(), and the URL, title and
 * stripped text are printed as a debugging report.
 *
 * NOTE(review): the URLs come straight from the request and are fetched
 * server-side without any allow-list; restrict this page to trusted users.
 */
if (isset($_POST['urls']) && is_string($_POST['urls'])) {
    include 'connect.php';

    // Accept any newline convention (\r\n, \n, \r) and drop empty lines.
    $urls = preg_split('/\R/', $_POST['urls'], -1, PREG_SPLIT_NO_EMPTY);
    if ($urls === false) {
        $urls = array();
    }

    foreach ($urls as $url) {
        $url = trim($url);
        if ($url === '') {
            continue;
        }

        $text = (string) file_get_utf8_contents($url);

        // eregi() no longer exists in PHP 7+; preg_match with /i is the
        // replacement. The group is non-greedy so that only the first
        // <title>...</title> pair is captured, and /s lets it span lines.
        $title = '';
        if (preg_match('~<title\b[^>]*>(.*?)</title>~is', $text, $regs)) {
            $title = trim($regs[1]);
        }

        $text = (string) strip_tags_smart($text);

        // Everything below is printed into an HTML page, so it is escaped for
        // the HTML context (mysql_escape_string() was removed in PHP 7 and was
        // never the right escaping for output anyway).
        $safe_url   = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $safe_title = htmlspecialchars($title, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $safe_text  = htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

        echo $safe_url . "<br>";
        echo $safe_title . "<br>";
        echo $safe_url . "<br>title = ";
        echo $safe_title . "<br>text=";
        echo $safe_text . "<br>";
        // The original also echoed an undefined $query here; only the
        // separating line break is kept.
        echo "<br>";
    }
}
/**
 * Download a document and pass it through the project's UTF8 converter.
 *
 * The raw bytes are lower-cased with strtolower() and then handed to
 * UTF8::convert_from() with the source charset 'utf8' when UTF8::is_utf8()
 * accepts them, or 'cp1251' otherwise.
 *
 * NOTE(review): strtolower() runs on the raw bytes before the charset is
 * known, and it also lower-cases the visible text of the page, not only the
 * tag names - confirm that this is intended.
 * NOTE(review): the UTF8 class is defined outside this file; presumably
 * convert_from() returns the text converted to UTF-8 - verify.
 *
 * @param string $fn File name or URL accepted by file_get_contents().
 * @return mixed Whatever UTF8::convert_from() returns, or an empty string
 *               when the document could not be read.
 */
function file_get_utf8_contents($fn) {

    $raw_text = file_get_contents($fn);
    if ($raw_text === false) {
        // The download failed (file_get_contents has already raised a
        // warning); do not feed the boolean into the string functions below.
        return '';
    }

    $raw_text = strtolower($raw_text);

    $source_charset = UTF8::is_utf8($raw_text) ? 'utf8' : 'cp1251';

    return UTF8::convert_from($raw_text, $source_charset);
}
?>
<?php /* URL submission form: posts the textarea content (one URL per line) back to this same script as $_POST['urls']. The original action used the undefined variable $_PHP_SELF without echo, so it always rendered an empty action. */ ?>
<form action="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8'); ?>" method="POST">
urls:<br>
<textarea rows="30" cols="100" wrap="physical" name="urls"></textarea>

  <input type="submit" value="Submit" />
</form>

脚本 UTF8：http://rapidshare.com/files/450118274/utf8.php

脚本strip_tags_smart：

<?php
/**
 * Strip HTML/XML tags from a string, a more thorough alternative to strip_tags().
 *
 * Besides ordinary tags it removes embedded <? ... ?> / <% ... %> code, CDATA
 * blocks, comments and conditional comments, and the elements listed in
 * $pair_tags together with their content. <br> becomes "\r\n" and tags listed
 * in $para_tags become "\r\n\r\n".
 *
 * The function is also used as its own preg_replace_callback() callback: when
 * $s is an array it is treated as a match of $re_tags and the replacement for
 * that single tag is returned. The static variables carry the settings from
 * the outer call into those callback invocations.
 *
 * NOTE(review): $_allowable_tags and $_para_tags are static and are only
 * overwritten when the corresponding argument is non-empty, so values from an
 * earlier call stay in effect for later calls that pass null / an empty array.
 *
 * @param string|array $s                Text to clean, or a regex match array when invoked as a callback.
 * @param array|null   $allowable_tags   Tags that must not be stripped; an entry is looked up either as the
 *                                       bare tag name (tag kept with its attributes) or as '<name>' (tag
 *                                       kept, attributes dropped).
 * @param bool         $is_format_spaces Whether whitespace is collapsed before and after tag removal.
 * @param array        $pair_tags        Elements removed together with everything between their opening
 *                                       and closing tag.
 * @param array        $para_tags        Tags replaced by a blank line ("\r\n\r\n").
 * @return string The input with tags removed (the input itself when it contains no '<...>' pair).
 */
function strip_tags_smart(
    /*string*/ $s,
    array $allowable_tags = null,
    /*boolean*/ $is_format_spaces = true,

    array $pair_tags = array('head', 'SCRIPT', 'script', 'style', 'map', 'iframe', 'frameset', 'object', 'applet', 'comment', 'button', 'textarea', 'select', 'img', 'form', 'noindex','noembed', 'applet'),
    array $para_tags = array('p', 'td', 'th','address','blockquote','center','div','fieldset','isindex','menu','ol','table', 'ul', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'form', 'title', 'pre')
)
{
    //$vowels = array("#","(",")","\"");
    //$s = str_replace($vowels, " ", $s);
    //echo $s;
    //echo "</br>-----------------------------------------------------------------</br>";

    //return strip_tags($s);

    #state shared between the outer call and the callback invocations of this same function
    static $_callback_type  = false;
    static $_allowable_tags = array();
    static $_para_tags      = array();
    #regular expression for tag attributes
    #correct processes dirty and broken HTML in a singlebyte or multibyte UTF-8 charset!
    static $re_attrs_fast_safe =  '(?![a-zA-Z\d])  #statement, which follows after a tag
                                   #correct attributes
                                   (?>
                                       [^>"\']+
                                     | (?<=[\=\x20\r\n\t]|\xc2\xa0) "[^"]*"
                                     | (?<=[\=\x20\r\n\t]|\xc2\xa0) \'[^\']*\'
                                   )*
                                   #incorrect attributes
                                   [^>]*+';

    #callback mode: $s is the match array of $re_tags ($s[0] = whole tag, $s[1] = tag name)
    if (is_array($s))
    {
        if ($_callback_type === 'strip_tags')
        {
            $tag = strtolower($s[1]);
            if ($_allowable_tags)
            {
                #tag with attributes
                if (array_key_exists($tag, $_allowable_tags)) return $s[0];

                #tag without attributes
                if (array_key_exists('<' . $tag . '>', $_allowable_tags))
                {
                    if (substr($s[0], 0, 2) === '</') return '</' . $tag . '>';
                    if (substr($s[0], -2) === '/>')   return '<' . $tag . ' />';
                    return '<' . $tag . '>';
                }
            }
            #line break and paragraph tags are turned into new lines, every other tag is dropped
            if ($tag === 'br') return "\r\n";
            if ($_para_tags && array_key_exists($tag, $_para_tags)) return "\r\n\r\n";
            return '';
        }
        trigger_error('Unknown callback type "' . $_callback_type . '"!', E_USER_ERROR);
    }

    if (($pos = strpos($s, '<')) === false || strpos($s, '>', $pos) === false)  #speed improve
    {
        #tags are not found
        return $s;
    }

    #remembered so that the whitespace clean-up at the end only runs when something was actually removed
    $length = strlen($s);

    #unpaired tags (opening, closing, !DOCTYPE, MS Word namespace)
    $re_tags = '~  <[/!]?+
                   (
                       [a-zA-Z][a-zA-Z\d]*+
                       (?>:[a-zA-Z][a-zA-Z\d]*+)?
                   ) #1
                   ' . $re_attrs_fast_safe . '
                   >
                ~sxSX';

    $patterns = array(
        '/<([\?\%]) .*? \\1>/sxSX',     #embedded PHP, Perl, ASP code
        '/<\!\[CDATA\[ .*? \]\]>/sxSX', #CDATA blocks
        #'/<\!\[  [\x20\r\n\t]* [a-zA-Z] .*?  \]>/sxSX',  #:DEPRECATED: MS Word tags like <![if! vml]>...<![endif]>

        '/<\!--.*?-->/sSX', #comments

        #MS Word tags like "<![if! vml]>...<![endif]>",
        #conditional code execution for IE like "<!--[if expression]> HTML <![endif]-->"
        #conditional code execution for IE like "<![if expression]> HTML <![endif]>"
        #see http://www.tigir.com/comments.htm
        '/ <\! (?:--)?+
               \[
               (?> [^\]"\']+ | "[^"]*" | \'[^\']*\' )*
               \]
               (?:--)?+
           >
         /sxSX',
    );
    if ($pair_tags)
    {
        #paired tags together with their content:
        foreach ($pair_tags as $k => $v) $pair_tags[$k] = preg_quote($v, '/');
        $patterns[] = '/ <((?i:' . implode('|', $pair_tags) . '))' . $re_attrs_fast_safe . '(?<!\/)>
                         .*?
                         <\/(?i:\\1)' . $re_attrs_fast_safe . '>
                       /sxSX';
    }
    #d($patterns);

    #the replacements are repeated until the text stops changing;
    #any PCRE error sets $i to 999, which triggers the plain strip_tags() fallback after the loop
    $i = 0; #protection against infinite looping
    $max = 99;
    while ($i < $max)
    {
        $s2 = preg_replace($patterns, '', $s);
        if (preg_last_error() !== PREG_NO_ERROR)
        {
            $i = 999;
            break;
        }

        #first pass only: decide whether the input is HTML at all and prepare whitespace and tag tables
        if ($i == 0)
        {
            $is_html = ($s2 != $s || preg_match($re_tags, $s2));
            if (preg_last_error() !== PREG_NO_ERROR)
            {
                $i = 999;
                break;
            }
            if ($is_html)
            {
                if ($is_format_spaces)
                {
                    /*
                    In the PCRE library for PHP, \s is any whitespace character, namely the character class [\x09\x0a\x0c\x0d\x20\xa0] or, put differently, [\t\n\f\r \xa0]
                    If \s is used with the /u modifier, \s is treated as [\x09\x0a\x0c\x0d\x20]
                    A browser makes no distinction between whitespace characters; consecutive ones are treated as a single one
                    */
                    #NOTE(review): the "\r\n" entry can never match, because "\r" and "\n" have already been
                    #replaced by the two entries before it
                    $s2 = str_replace(array("\r", "\n", "\r\n", "\t"), ' ', $s2);
                    #NOTE(review): per the PHP manual strtr() ignores the extra characters when its from/to
                    #strings differ in length, so presumably only "\x09" is mapped here - confirm the intent
                    $s2 = strtr($s2, "\x09\x0a\x0c\x0d", ' ');
                    #replaces remaining whitespace runs with a space; the second alternative consumes a whole
                    #pre/textarea element and, because of \K, only inserts a space after it
                    $s2 = preg_replace('/  [\x09\x0a\x0c\x0d]++
                                         | <((?i:pre|textarea))' . $re_attrs_fast_safe . '(?<!\/)>
                                           .+?
                                           <\/(?i:\\1)' . $re_attrs_fast_safe . '>
                                           \K
                                        /sxSX', ' ', $s2);
                    if (preg_last_error() !== PREG_NO_ERROR)
                    {
                        $i = 999;
                        break;
                    }
                }

                #array of tags that will not be stripped
                if ($allowable_tags) $_allowable_tags = array_flip($allowable_tags);

                #paired tags that will be treated as paragraphs
                if ($para_tags) $_para_tags = array_flip($para_tags);
            }
        }#if

        #tags processing
        if ($is_html)
        {
            #this function is passed as the callback; $_callback_type tells it how to treat the match array
            $_callback_type = 'strip_tags';
            $s2 = preg_replace_callback($re_tags, __FUNCTION__, $s2);
            $_callback_type = false;
            if (preg_last_error() !== PREG_NO_ERROR)
            {
                $i = 999;
                break;
            }
        }

        #stop as soon as a full pass changes nothing
        if ($s === $s2) break;
        $s = $s2; $i++;
    }#while
    if ($i >= $max) $s = strip_tags($s); #too many cycles for replace...

    if ($is_format_spaces && strlen($s) !== $length)
    {
        #remove a duplicate spaces
        $s = preg_replace('/\x20\x20++/sSX', ' ', trim($s));

        #NOTE(review): four patterns but only two replacements - per the PHP manual the patterns without a
        #matching replacement are replaced by an empty string, so "&nbsp" and the special characters
        #of the last pattern are deleted while the first two patterns become a space.
        #NOTE(review): "\r\n" is turned into a space here, i.e. before the new line clean-up below runs,
        #which removes the line breaks produced for br and paragraph tags - confirm this is wanted.
        #NOTE(review): with the /u modifier preg_replace() returns null for a subject that is not valid
        #UTF-8 - verify that the input is always UTF-8 at this point.
        $s  = preg_replace(
        array(
            // Remove invisible content
            '@\r\n@siu',
            '@[#&].{3,5};@siu',
            '@[&]nbsp@siu',
            '@[`~\@\#\$\%^\&*()=+{}[\]"\'\\|/<>]@siu',

        ),
        array(
            ' ', ' ',
        ),
        $s);
//,


        #remove a spaces before and after new lines
        $s = str_replace(array("\r\n\x20", "\x20\r\n"), "\r\n", $s);
        #replace 3 and more new lines to 2 new lines
        $s = preg_replace('/[\r\n]{3,}+/sSX', "\r\n\r\n", $s);


    }
    return $s;

}
?>

我需要删除所有标签，只保留人在浏览器中能看到的文字。

Answer 1

PHP 的 strip_tags() 函数应该能够处理 UTF-8 字符串。在我看来，你的问题在于如何解析原始网站的 HTML，从中取出你感兴趣的部分。这可能很难，因为示例网站（http://lita.ru/health/doctor/）包含无法通过验证的 HTML（non-validating HTML）。另外，复制第三方内容时请注意版权问题。

如何删除所有标签？

1 个答案: