<?php
/**
 * Reduce an HTML document to plain text.
 *
 * Every removed construct is replaced by a single space so that words which
 * were separated only by markup do not run together.
 *
 * preg_replace() applies the patterns one after another in array order, so
 * the order below matters:
 *   1. <script> blocks and 2. <style> blocks are removed together with their
 *      content while their tags are still present. If the generic tag pattern
 *      ran first it would strip only the <style>/</style> tags and leave the
 *      CSS text behind.
 *   3. every remaining tag (including <!DOCTYPE ...> and simple comments);
 *   4. whatever is left of multi-line comments / CDATA sections. This has to
 *      stay after step 3: run earlier, it could match from "<!DOCTYPE" up to
 *      the end of the first comment and delete real content.
 *
 * @param string $document Raw HTML.
 * @return string|null Text with markup removed; null if PCRE reports an error.
 */
function html2txt($document)
{
    $search = array(
        '@<script[^>]*?>.*?</script>@si',
        // No "U" modifier here: it inverts greediness, which turned ".*?" into
        // a greedy match spanning from the first <style> to the last </style>.
        '@<style[^>]*?>.*?</style>@si',
        '@<[\/!]*?[^<>]*?>@si',
        '@<![\s\S]*?--[\t\n\r]*>@',
    );

    return preg_replace($search, ' ', $document);
}
// Source page: http://www.metacritic.com/game/pc/battlefield-4/critic-reviews
// Load the saved copy of the page, reduce it to plain text with html2txt() and
// store the result in striptags.txt, which the keyword scans further down
// read back line by line.
$doc = file_get_contents("Battlefield4.htm");
if ($doc === false) {
    // Without the page every later step would silently work on empty input
    // and still produce a (meaningless) report.
    exit("Unable to read Battlefield4.htm");
}
$result = html2txt($doc);
// file_put_contents() replaces the fopen()/fwrite()/fclose() sequence and lets
// a failed write be detected instead of handing a false handle to fwrite().
if (file_put_contents("striptags.txt", $result) === false) {
    exit("Unable to write striptags.txt");
}
// Collect the href of every <a> tag in the page and append to links.txt the
// ones that contain the requested search term (?t1=...) or the word "download".

// Reuse the page already loaded into $doc instead of reading it a second time:
// the second read used a differently-cased name ("BattleField4.htm"), which is
// a different, possibly missing, file on case-sensitive file systems.
$original_file = $doc;
$stripped_file = strip_tags($original_file, "<a>");

$matches = array();
preg_match_all("/<a(?:[^>]*)href=\"([^\"]*)\"(?:[^>]*)>(?:[^<]*)<\/a>/is", $stripped_file, $matches);
// $matches[0] holds the complete <a> tags, $matches[1] only their href values.
$hrefs = isset($matches[1]) ? $matches[1] : array();

// The search term is untrusted request input: it may be absent or not a string,
// and it must be quoted before being placed in a pattern, otherwise a "/" or
// any regex metacharacter in it breaks or alters the expression.
// An empty term still matches every link, as it did before.
$t = (isset($_GET['t1']) && is_string($_GET['t1'])) ? $_GET['t1'] : '';
$term_pattern = '/(?:' . preg_quote($t, '/') . ')/i';

$wanted = '';
foreach ($hrefs as $href) {
    if (preg_match($term_pattern, $href) === 1 || preg_match("/download/i", $href) === 1) {
        $wanted .= $href . "\n";
    }
}

// One append for all matching links; nothing is written (and links.txt is not
// created) when no link matched.
if ($wanted !== '') {
    file_put_contents('links.txt', $wanted, FILE_APPEND);
}
// Positive comments: append to positivecomments.txt every line of
// striptags.txt that contains a word from positive.txt, numbered from 0 and
// followed by the total. $x keeps the positive count for the later comparison
// with the negative count.

// Rewrites a file with its duplicate lines removed (first occurrence kept).
// A file that does not exist yet (e.g. the report on the first run) is left
// alone instead of passing file()'s false result on to array_unique().
$dedupe_lines = function ($path) {
    $lines = is_file($path) ? file($path) : false;
    if ($lines !== false) {
        file_put_contents($path, implode('', array_unique($lines)));
    }
};

// Collapse duplicate lines left in the report by earlier runs.
$dedupe_lines('positivecomments.txt');

$text = file('striptags.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$words = file('positive.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($text === false) {
    $text = array();
}
if ($words === false) {
    $words = array();
}
$pattern = implode('|', $words);

$x = 0;
$report = "Positive comments:\n";
// An empty word list would give the pattern /(?:)/i, which matches every line;
// in that case nothing is counted.
if ($pattern !== '') {
    foreach ($text as $string) {
        if (1 === preg_match("/(?:$pattern)/i", $string)) {
            $report .= $x . "\t" . $string . "\n";
            $x++;
        }
    }
}
$report .= "\nTotal Positive comments are:\n" . $x;
file_put_contents('positivecomments.txt', $report, FILE_APPEND);

// De-duplicating links.txt does not depend on the matched line, so it is done
// once here rather than once per positive comment inside the loop.
$dedupe_lines('links.txt');
$dedupe_lines('positivecomments.txt');
// Negative comments: append to positivecomments.txt every line of
// striptags.txt that contains a word from negative.txt, numbered from 0 and
// followed by the total, then add the recommendation.
$text = file('striptags.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$words = file('negative.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($text === false) {
    $text = array();
}
if ($words === false) {
    $words = array();
}
$pattern = implode('|', $words);

// The counter has to start at 0 explicitly: left undefined, the first comment
// was labelled with an empty string, a zero total was printed as nothing, and
// the comparison below ran against null.
$y = 0;
$report = "\n\nNegative comments:\n";
// An empty word list would give the pattern /(?:)/i, which matches every line;
// in that case nothing is counted.
if ($pattern !== '') {
    foreach ($text as $string) {
        if (1 === preg_match("/(?:$pattern)/i", $string)) {
            $report .= $y . "\t" . $string . "\n";
            $y++;
        }
    }
}
$report .= "\nTotal Negative comments are:\n" . $y;

// $x is the positive count produced by the positive-comment scan.
// NOTE(review): a tie ($x == $y) falls into the "more negative points" branch,
// as before - confirm that this is the intended verdict for equal counts.
if ($x > $y) {
    $report .= "\nThis game has more positive points so it is recommended to download\n";
} else {
    $report .= "\nThis game has more negative points so it is not recommended to download\n";
}
file_put_contents('positivecomments.txt', $report, FILE_APPEND);
// Comments with parameters: scan the report written so far
// (positivecomments.txt) for lines containing a word from parameters.txt,
// append those lines and their count to the report, de-duplicate the report
// and return to the interface page.
$text = file('positivecomments.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$words = file('parameters.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($text === false) {
    $text = array();
}
if ($words === false) {
    $words = array();
}
$pattern = implode('|', $words);

$x1 = 0;
$report = "\nComments with parameters:\n";
// An empty word list would give the pattern /(?:)/i, which matches every line;
// in that case nothing is counted.
if ($pattern !== '') {
    foreach ($text as $string) {
        if (1 === preg_match("/(?:$pattern)/i", $string)) {
            $report .= "\t" . $string . "\n";
            $x1++;
        }
    }
}
$report .= "\nTotal comments are:\n" . $x1 . "\n";
file_put_contents('positivecomments.txt', $report, FILE_APPEND);

// Remove duplicate lines from the finished report (first occurrence kept).
$lines = file('positivecomments.txt');
if ($lines !== false) {
    file_put_contents('positivecomments.txt', implode('', array_unique($lines)));
}

header("Location: http://localhost/crawler/interface.php");
// Stop here so nothing that follows the closing PHP tag is sent along with
// the redirect.
exit;
?>
这是我的 PHP 文件,用于从网页中提取内容,但问题是它把 CSS 样式代码也一并提取了出来。我想跳过所有的 script 和 style 标签。我已经在 html2txt 函数中为此编写了相应的正则表达式,但它不起作用。有人能帮忙看看这个问题吗?