如果页面没有可用的元描述,或者元描述不超过10个字符,我的抓取工具怎样才能改用页面内容作为描述?我希望这个描述最多包含30个字符。
以下是代码:
<?php
// Access guard: the script only runs when an including script supplied the
// expected $crawlToken, or when the request carries the '78wc58v' query key.
if ((!isset($crawlToken) || $crawlToken != 418941) && !isset($_GET['78wc58v'])) {
    die("Error");
}
// Print PHP errors into the crawler output.
ini_set('display_errors', 'on');

// Resolved directory of this script; every path below is built from it.
$dir = realpath(__DIR__);

// Project configuration (presumably where $dbh and HOST come from - confirm).
include $dir . '/../inc/config.php';
/**
 * Shutdown handler registered for the crawler script.
 *
 * Does nothing unless the last recorded error is a fatal E_ERROR. In that
 * case it writes "0" to crawlStatus.txt next to this script and requests
 * runCrawl.php on HOST through get_headers() (presumably to start a new
 * crawl run - confirm against runCrawl.php).
 *
 * @return void
 */
function shutdown(){
    global $dir;

    $lastError = error_get_last();
    if ($lastError === NULL || $lastError['type'] !== E_ERROR) {
        return;
    }

    file_put_contents($dir . "/crawlStatus.txt", "0");
    get_headers(HOST . "/crawler/runCrawl.php");
}
// Remove the execution time limit for this script.
set_time_limit(0);

// Run shutdown() when the script ends so a fatal error can be handled there.
register_shutdown_function('shutdown');

// Libraries used below: PHPCrawl (PHPCrawler base class) and simple_html_dom
// (str_get_html).
include $dir . '/PHPCrawl/libs/PHPCrawler.class.php';
include $dir . '/simple_html_dom.php';
/**
 * Stores a crawled page in the `search` table: inserts a new row, or updates
 * the title and description of the row that already has the same URL.
 *
 * Nothing is stored (and nothing is printed) when the title is empty after
 * whitespace normalisation or when the URL fails FILTER_VALIDATE_URL.
 * The accepted URL is echoed, HTML-escaped, as a progress line.
 *
 * Uses the global $dbh, which is assumed to be a PDO connection - confirm
 * against inc/config.php.
 *
 * @param string $t Page title; whitespace runs are collapsed and HTML entities decoded.
 * @param string $u Page URL.
 * @param string $d Page description; HTML entities are decoded.
 * @return void
 */
function addURL($t, $u, $d){
    global $dbh;

    // Normalise first, so a title consisting only of whitespace is rejected
    // instead of being stored as an empty string.
    $t = trim((string) preg_replace("/\s+/", " ", (string) $t), " ");
    if ($t === "" || !filter_var($u, FILTER_VALIDATE_URL)) {
        return;
    }

    $t = html_entity_decode($t, ENT_QUOTES);
    $d = html_entity_decode((string) $d, ENT_QUOTES);

    // The URL comes from crawled pages, so escape it before it reaches the browser.
    echo htmlspecialchars($u, ENT_QUOTES)."<br/>\n";
    // ob_flush() raises a notice when no output buffer is active.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();

    // Decide between INSERT and UPDATE by fetching a row: rowCount() is not
    // reliable for SELECT statements on every PDO driver.
    $check = $dbh->prepare("SELECT `id` FROM `search` WHERE `url`=?");
    $check->execute(array($u));
    $exists = $check->fetchColumn() !== false;
    $check->closeCursor();

    if (!$exists) {
        $sql = $dbh->prepare("INSERT INTO `search` (`title`, `url`, `description`) VALUES (?, ?, ?)");
        $sql->execute(array($t, $u, $d));
    } else {
        $sql = $dbh->prepare("UPDATE `search` SET `description` = ?, `title` = ? WHERE `url`=?");
        $sql->execute(array($d, $t, $u));
    }
}
/**
 * Crawler that indexes each successfully fetched HTML page through addURL().
 *
 * The stored description is the page's meta description. When that is
 * missing or too short, the beginning of the page's body text is used instead.
 */
class WSCrawler extends PHPCrawler {
    /** A meta description of at most this many characters is treated as unusable. */
    const MIN_META_DESCRIPTION_LENGTH = 10;

    /** Maximum number of characters taken from the page text as a fallback description. */
    const FALLBACK_DESCRIPTION_LENGTH = 30;

    /**
     * Handles one fetched document: parses it, extracts title and description
     * and passes them to addURL(). Documents without HTTP status 200, without
     * source, without a parsable DOM or without a <title> are ignored.
     *
     * @param PHPCrawlerDocumentInfo $p Document data; url, http_status_code and source are read.
     * @return void
     */
    function handleDocumentInfo(PHPCrawlerDocumentInfo $p){
        $u = $p->url;
        $c = $p->http_status_code;
        $s = $p->source;
        if ($c != 200 || $s == "") {
            return;
        }

        $html = str_get_html($s);
        if (!is_object($html)) {
            return;
        }

        $d = "";
        $do = $html->find("meta[name=description]", 0);
        if ($do) {
            // The cast turns a missing content attribute into an empty string.
            $d = trim((string) $do->content);
        }

        // Missing or too-short meta description: describe the page by its text.
        // A short meta description is kept when the page has no usable text.
        if (self::textLength($d) <= self::MIN_META_DESCRIPTION_LENGTH) {
            $fallback = $this->describeFromContent($html);
            if ($fallback !== "") {
                $d = $fallback;
            }
        }

        $t = $html->find("title", 0);
        if ($t) {
            addURL($t->innertext, $u, $d);
        }

        // Release the parsed DOM explicitly before the next document.
        $html->clear();
        unset($html);
    }

    /**
     * Builds a short description from the text of the <body> element.
     *
     * @param object $html Parsed document as returned by str_get_html().
     * @return string At most FALLBACK_DESCRIPTION_LENGTH characters; "" when there is no body text.
     */
    private function describeFromContent($html){
        $body = $html->find("body", 0);
        if (!$body) {
            return "";
        }

        // Decode entities before cutting so the cut cannot land inside an
        // entity such as "&amp;". addURL() decodes once more afterwards.
        $text = html_entity_decode((string) $body->plaintext, ENT_QUOTES);
        $text = trim((string) preg_replace("/\s+/", " ", $text));

        return self::truncate($text, self::FALLBACK_DESCRIPTION_LENGTH);
    }

    /**
     * Length of $text in characters (bytes when mbstring is unavailable).
     *
     * @param string $text
     * @return int
     */
    private static function textLength($text){
        return function_exists('mb_strlen') ? mb_strlen($text, 'UTF-8') : strlen($text);
    }

    /**
     * First $limit characters of $text (bytes when mbstring is unavailable).
     *
     * @param string $text
     * @param int $limit
     * @return string
     */
    private static function truncate($text, $limit){
        return function_exists('mb_substr') ? mb_substr($text, 0, $limit, 'UTF-8') : substr($text, 0, $limit);
    }
}
/**
 * Runs one crawl starting at the given URL, using WSCrawler to index the
 * pages that are received.
 *
 * Only text/html documents are received. Unless $GLOBALS['bgFull'] is set,
 * the traffic limit is set to 2000 * 1024.
 *
 * @param string $u Start URL.
 * @return void
 */
function crawl($u){
    $C = new WSCrawler();
    $C->setURL($u);
    $C->addContentTypeReceiveRule("#text/html#");
    // Skip static assets by file extension. The "\." is required: without it
    // any URL that merely ends in these letters (e.g. ".../nodejs") is skipped.
    $C->addURLFilterRule('#\.(jpg|gif|png|pdf|jpeg|svg|css|js)$# i');
    if (!isset($GLOBALS['bgFull'])) {
        $C->setTrafficLimit(2000 * 1024);
    }
    $C->obeyRobotsTxt(true);
    $C->obeyNoFollowTags(true);
    $C->setUserAgentString("Nevo (../about/bot.php)");
    // NOTE(review): follow mode 0 presumably lets the crawler follow every
    // link, including other hosts - confirm against the PHPCrawl documentation.
    $C->setFollowMode(0);
    $C->go();
}
// Entry point. Without $url4Array, crawl from a randomly chosen URL that is
// already indexed (or from the default URL when the index is empty). With an
// array in $url4Array, crawl each of its URLs in turn.
// $dbh is assumed to be a PDO connection - confirm against inc/config.php.
if (!isset($url4Array)) {
    // Get the last indexed URLs (If there isn't, use default URL's) & start Crawling
    $last = $dbh->query("SELECT `url` FROM search");
    // Count the fetched rows: rowCount() is not reliable for SELECT
    // statements on every PDO driver.
    $urls = $last->fetchAll();
    $count = count($urls);
    if ($count < 1) {
        crawl("http://localhost"); // The Default URL #1
    } else {
        $index = rand(0, $count - 1);
        crawl($urls[$index]['url']);
    }
} elseif (is_array($url4Array)) {
    foreach ($url4Array as $url) {
        crawl($url);
    }
}
?>
答案 0 :(得分:1)
我在这里只是猜测,因为我以前没有听说过 PHPCrawler,
State_CSA_1:
S <= '0'&(A xor B xor 0);
C <= (A and B) or (A and 0) or (B and 0) & '0';
out_ready<='0';--the result is not ready yet
nextstate<=State_CSA_2;
State_CSA_2:
S <= '0'&(S xor C xor F);
C <= (S and C) or (S and F) or (C and F) & '0';
nextstate<=S_out;
S_out:
RES_S<=S;
RES_C<=C;
out_ready<='1';--the result is ready
nextstate<=State_CSA_1;
而且我也没有实际测试过这段代码。下面的代码用来替换 PHPCrawler 子类的 handleDocumentInfo 方法中获取描述的部分。
当然,我建议你不要逐字照搬。自己动手调整一下,就能得到可用的结果。
我也改变了一些变量:$do = $html->find("meta[name=description]", 0);
// Use the meta description when present; a missing content attribute becomes "".
$description = $do ? trim((string) $do->content) : '';
$descriptionLength = function_exists('mb_strlen') ? mb_strlen($description, 'UTF-8') : strlen($description);
// A missing description, or one of 10 characters or fewer, is treated as unusable.
if ($descriptionLength <= 10) {
    // Fall back to the page text. ->plaintext is the element's text;
    // ->content would read a "content" attribute, which <body> does not have.
    $body = $html->find("body", 0);
    $text = $body ? trim((string) preg_replace('/\s+/', ' ', (string) $body->plaintext)) : '';
    if ($text !== '') {
        // Keep at most 30 characters (bytes when mbstring is unavailable).
        $description = function_exists('mb_substr') ? mb_substr($text, 0, 30, 'UTF-8') : substr($text, 0, 30);
    }
}
原来的变量名 $d 并不能说明它的用途。
答案 1 :(得分:1)
// Use the meta description when present; a missing content attribute becomes "".
$do = $html->find("meta[name=description]", 0);
$d = $do ? trim((string) $do->content) : "";
$dLength = function_exists('mb_strlen') ? mb_strlen($d, 'UTF-8') : strlen($d);
// A missing description, or one of 10 characters or fewer, is treated as unusable.
if ($dLength <= 10) {
    $do = $html->find("p", 0); // OR $html->find("h2",0) OR whatever;
    // The page may not contain the element at all, so check before reading it.
    $text = $do ? trim((string) preg_replace('/\s+/', ' ', (string) $do->plaintext)) : "";
    if ($text !== "") {
        // just 30 chars (bytes when mbstring is unavailable)
        $d = function_exists('mb_substr') ? mb_substr($text, 0, 30, 'UTF-8') : substr($text, 0, 30);
    }
}
如果没有元描述,就改用页面中第一个 “p”(或 “h2”)元素的文本作为描述。