我有一个函数,它使用 PHP 中的正则表达式提取给定页面中的所有链接。现在我想继续提取每个已找到的链接所指向页面中的链接,并依此类推(递归抓取)。
这是我的代码
/**
 * Fetch a page and return the unique absolute links found in its <a> tags.
 *
 * Absolute http:// and https:// links are kept as they are. Protocol-relative
 * links ("//host/path") get the page's scheme. Other links are treated as
 * site-relative and attached to the root of the page's origin (they are not
 * resolved against the page's directory). Links using any other scheme
 * (javascript:, mailto:, ...), empty links and fragment-only links ("#top")
 * are skipped because they cannot be fetched as pages.
 *
 * @param string $url Page to read; anything file_get_contents() accepts.
 * @return array List of unique absolute URLs, in document order. Empty when
 *               the page cannot be read or contains no usable links.
 */
function getLinks($url){
    $content = file_get_contents($url);
    if ($content === false) {
        // Unreadable page: there is nothing to extract.
        return array();
    }

    // Relative links belong to the host the page was fetched from. When $url
    // carries no host (e.g. a local file path), fall back to the host of the
    // current request.
    $scheme = 'http';
    $host   = '';
    $parts  = parse_url($url);
    if (is_array($parts) && isset($parts['host'])) {
        $host = $parts['host'];
        if (isset($parts['port'])) {
            $host .= ':' . $parts['port'];
        }
        if (isset($parts['scheme']) && strtolower($parts['scheme']) === 'https') {
            $scheme = 'https';
        }
    } elseif (isset($_SERVER['HTTP_HOST'])) {
        $host = $_SERVER['HTTP_HOST'];
    }
    $origin = $scheme . '://' . $host;

    // Capture the href value of every <a ...> tag, accepting single or double
    // quotes. The look-behind keeps attributes such as data-href from matching.
    $matches = array();
    preg_match_all('/<a\s[^>]*?(?<![\w-])href\s*=\s*(["\'])(.*?)\1/is', $content, $matches);
    $hrefs = isset($matches[2]) ? $matches[2] : array();

    $l_clean = array();
    foreach ($hrefs as $href) {
        // Attribute values are HTML-escaped in the page source (&amp; etc.).
        $href = trim(html_entity_decode($href, ENT_QUOTES, 'UTF-8'));
        if ($href === '' || $href[0] === '#') {
            continue;
        }

        if (preg_match('#^https?://#i', $href)) {
            // Already absolute: keep untouched.
            $f_link = $href;
        } elseif (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
            // javascript:, mailto:, tel:, data: ... are not crawlable pages.
            continue;
        } elseif (substr($href, 0, 2) === '//') {
            $f_link = $scheme . ':' . $href;
        } else {
            $f_link = $origin . '/' . ltrim($href, '/');
        }

        if (!in_array($f_link, $l_clean, true)) {
            $l_clean[] = $f_link;
        }
    }

    return $l_clean;
}
答案 0 :(得分:1)
只需递归调用该函数,并用一个深度参数来终止递归:
/**
 * Recursively collect the unique absolute links reachable from a page.
 *
 * The page at $url is fetched, the links in its <a> tags are normalised to
 * absolute URLs, and every link not seen before is both returned and crawled
 * in turn with the remaining depth.
 *
 * Depth handling: $depth is decremented before anything else and the call
 * returns immediately once it reaches zero. So a $depth of 1 fetches nothing,
 * 2 fetches only $url, 3 fetches $url and the pages it links to, and so on.
 *
 * Link normalisation: absolute http:// and https:// links are kept as they
 * are. Protocol-relative links get the page's scheme. Other links are
 * attached to the root of the page's origin (not resolved against the page's
 * directory). Links with any other scheme (javascript:, mailto:, ...), empty
 * links and fragment-only links are skipped.
 *
 * @param string $url   Page to start from; anything file_get_contents() accepts.
 * @param int    $depth Remaining recursion budget (see above).
 * @param array  $seen  Internal bookkeeping shared between recursive calls:
 *                      URL => true for every URL already discovered. Callers
 *                      normally omit it.
 * @return array Flat list of unique absolute URLs discovered, each followed
 *               by the new links found beneath it. $url itself is excluded.
 */
function getLinks($url, $depth, &$seen = array()){
    if( --$depth <= 0 ) return array();

    // Never report or re-crawl the page we are currently on.
    $seen[$url] = true;

    $content = file_get_contents($url);
    if ($content === false) {
        // Unreadable page: contribute nothing, but let the crawl continue.
        return array();
    }

    // Relative links belong to the host the page was fetched from. When $url
    // carries no host (e.g. a local file path), fall back to the host of the
    // current request.
    $scheme = 'http';
    $host   = '';
    $parts  = parse_url($url);
    if (is_array($parts) && isset($parts['host'])) {
        $host = $parts['host'];
        if (isset($parts['port'])) {
            $host .= ':' . $parts['port'];
        }
        if (isset($parts['scheme']) && strtolower($parts['scheme']) === 'https') {
            $scheme = 'https';
        }
    } elseif (isset($_SERVER['HTTP_HOST'])) {
        $host = $_SERVER['HTTP_HOST'];
    }
    $origin = $scheme . '://' . $host;

    // Capture the href value of every <a ...> tag, accepting single or double
    // quotes. The look-behind keeps attributes such as data-href from matching.
    $matches = array();
    preg_match_all('/<a\s[^>]*?(?<![\w-])href\s*=\s*(["\'])(.*?)\1/is', $content, $matches);
    $hrefs = isset($matches[2]) ? $matches[2] : array();

    $found = array();
    foreach ($hrefs as $href) {
        // Attribute values are HTML-escaped in the page source (&amp; etc.).
        $href = trim(html_entity_decode($href, ENT_QUOTES, 'UTF-8'));
        if ($href === '' || $href[0] === '#') {
            continue;
        }

        if (preg_match('#^https?://#i', $href)) {
            // Already absolute: keep untouched.
            $f_link = $href;
        } elseif (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
            // javascript:, mailto:, tel:, data: ... are not crawlable pages.
            continue;
        } elseif (substr($href, 0, 2) === '//') {
            $f_link = $scheme . ':' . $href;
        } else {
            $f_link = $origin . '/' . ltrim($href, '/');
        }

        if (isset($seen[$f_link])) {
            continue;
        }
        // Mark before recursing so cycles between pages cannot loop back here.
        $seen[$f_link] = true;
        $found[] = $f_link;

        // Everything the child returns is new by construction, because $seen
        // is shared across the whole crawl.
        foreach (getLinks($f_link, $depth, $seen) as $child) {
            $found[] = $child;
        }
    }

    return $found;
}
// Start the crawl at http://myurl.com, passing 3 as the depth argument;
// $links receives whatever getLinks() returns.
$links = getLinks("http://myurl.com", 3);