使用php检索网页的内容

时间:2014-03-12 08:45:10

标签: php html

<?php


function html2txt($document)
{
    $search=array('@<script[^>]*?>.*?</script>@si','@<[\/!]*?[^<>]*?>@si','@<style[^>]*?>.*?</style>@siU','@<![\s\S]*?--[\t\n\r]*>@');
    $text=preg_replace($search,' ',$document);
    return $text;
}

//http://www.metacritic.com/game/pc/battlefield-4/critic-reviews
$doc=file_get_contents("Battlefield4.htm");
$result=html2txt($doc);
//echo $result;


$fh=fopen("striptags.txt","w");
    fwrite($fh,$result);
    fclose($fh);    





  $original_file = file_get_contents("BattleField4.htm");
  $stripped_file = strip_tags($original_file, "<a>");
  $x=array();
  preg_match_all("/<a(?:[^>]*)href=\"([^\"]*)\"(?:[^>]*)>(?:[^<]*)<\/a>/is", $stripped_file, $matches);
  $x=$matches[1];


  //DEBUGGING

  //$matches[0] now contains the complete A tags; ex: <a href="link">text</a>
  //$matches[1] now contains only the HREFs in the A tags; ex: link

  //header("Content-type: text/plain"); //Set the content type to plain text so the print below is easy to read!
  //if(preg_match_all("/game\/pc\/i",$matches,$matches2)
 // print_r($x); //View the array to see if it worked
 $t=$_GET['t1'];
//print $t."\n";


/*
  for($i=0;$i<sizeof($x);$i++)
  {

    print $x[$i]."\n";

  }
  //print_r($matches);
  //print_r($matches2);
  */

  for($i=0;$i<sizeof($x);$i++)
  {
        if(1 === preg_match("/(?:$t)/i",$x[$i],$matches) || preg_match("/download/i",$x[$i],$matches))
        {
        //print $x[$i]."\n";
        file_put_contents('links.txt', $x[$i]."\n", FILE_APPEND);
        }

  }



$lines=file('positivecomments.txt');
$lines=array_unique($lines);
file_put_contents('positivecomments.txt',implode($lines));



//to match positive comments
$text = file('striptags.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$words = file('positive.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$pattern = implode('|', $words);


$x=0;
file_put_contents('positivecomments.txt', "Positive comments:\n", FILE_APPEND);


foreach ($text as $string) 
{
    if (1 === preg_match("/(?:$pattern)/i", $string))
    {
        //$string2=implode('\n',$string);
        file_put_contents('positivecomments.txt',$x."\t".$string."\n", FILE_APPEND);
        $x++;
        $lines=file('links.txt');
        $lines=array_unique($lines);
        file_put_contents('links.txt',implode($lines));

    }
}
file_put_contents('positivecomments.txt', "\nTotal Positive comments are:\n".$x, FILE_APPEND);

$lines=file('positivecomments.txt');
$lines=array_unique($lines);
file_put_contents('positivecomments.txt',implode($lines));


//to match negative comments
$text = file('striptags.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$words = file('negative.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$pattern = implode('|', $words);


file_put_contents('positivecomments.txt', "\n\nNegative comments:\n", FILE_APPEND);
foreach ($text as $string) {
    if (1 === preg_match("/(?:$pattern)/i", $string))
    {
        //$string2=implode('\n',$string);
        file_put_contents('positivecomments.txt',$y."\t".$string."\n", FILE_APPEND);
        $y++;
    }
}
file_put_contents('positivecomments.txt', "\nTotal Negative comments are:\n".$y, FILE_APPEND);


if($x>$y)
{
$op1="\nThis game has more positive points so it is recommended to download\n";
file_put_contents('positivecomments.txt', $op1, FILE_APPEND);   
}
else
{
$op2="\nThis game has more negative points so it is not recommended to download\n";
file_put_contents('positivecomments.txt', $op2, FILE_APPEND);   
}



//to match comments with parameters
$text = file('positivecomments.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$words = file('parameters.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$pattern = implode('|', $words);

$x1=0;
file_put_contents('positivecomments.txt', "\nComments with parameters:\n", FILE_APPEND);
foreach ($text as $string) 
{
    if (1 === preg_match("/(?:$pattern)/i", $string))
    {
        //$string2=implode('\n',$string);
        file_put_contents('positivecomments.txt',"\t".$string."\n", FILE_APPEND);
        $x1++;
    }
}
file_put_contents('positivecomments.txt', "\nTotal comments are:\n".$x1."\n", FILE_APPEND);

$lines=file('positivecomments.txt');
$lines=array_unique($lines);
file_put_contents('positivecomments.txt',implode($lines));

header("Location: http://localhost/crawler/interface.php");
//header("Content-type: text/html");
?>

这是我的php文件,它从网页中提取内容,但问题是它还提取了css标签。我想跳过所有的脚本和样式标签。我已经在html2txt函数中编写了它,但它不起作用。所以任何人都可以帮助这个。

0 个答案:

没有答案