嗨!我编写了以下代码来解析一个网站并获取其中的所有链接。但我刚刚发现,我的代码只能获取到第三层以内的链接。我想解析得更深一些,获取第四层内的所有链接。我该怎么做?需要在代码中添加什么才能更深入地解析网站?提前致谢!
*注意:第1层=红色;第2层=黄色;第3层=绿色;第4层=蓝色;请看下面的图片。
<?php
// Accumulator for every unique URL found; get_links() appends to it via `global`.
$c = [];
// Page whose links are collected.
$url = 'http://www.nytimes.com';
/**
 * Cuts a URL at its fragment ('#') and then at its query string ('?').
 *
 * A marker in the very first position is deliberately left alone: a leading
 * '#' is how the anchor normaliser recognises a same-page link, so it must
 * survive this step.
 *
 * @param string $href Raw or partially normalised URL.
 * @return string The URL without fragment and query.
 */
function _get_links_strip_suffix($href)
{
    foreach (array('#', '?') as $marker) {
        $pos = strpos($href, $marker);
        if ($pos !== false && $pos > 0) {
            $href = substr($href, 0, $pos);
        }
    }

    return $href;
}

/**
 * Normalises the URL of a stylesheet, image or script.
 *
 * Drops the leading '//' of a protocol-relative URL, then removes fragment
 * and query. Relative paths are returned as written (they are not resolved
 * against the page URL).
 *
 * @param string $href Value of the element's href/src attribute.
 * @return string Normalised URL.
 */
function _get_links_normalize_resource($href)
{
    if (strncmp($href, '//', 2) === 0) {
        $href = (string) substr($href, 2);
    }

    return _get_links_strip_suffix($href);
}

/**
 * Normalises the href of an <a> element into an absolute-looking URL.
 *
 * - http:// and https:// URLs are kept (minus fragment and query).
 * - mailto: links are wrapped in square brackets.
 * - '//host/path' loses its leading slashes and gets an 'http://' prefix.
 * - '#fragment' links resolve to the page URL itself.
 * - everything else is appended to the page host.
 *
 * @param string      $href     Value of the href attribute.
 * @param string      $url      URL of the page being parsed.
 * @param string|null $base_url Host component of $url (null when $url has none).
 * @return string Normalised link.
 */
function _get_links_normalize_anchor($href, $url, $base_url)
{
    $href = _get_links_strip_suffix($href);

    // A single leading dot is dropped, so './page' becomes '/page'.
    if (strncmp($href, '.', 1) === 0) {
        $href = (string) substr($href, 1);
    }

    if (strncmp($href, 'http://', 7) === 0 || strncmp($href, 'https://', 8) === 0) {
        return $href;
    }
    if (strncmp($href, 'mailto:', 7) === 0) {
        return '[' . $href . ']';
    }

    if (strncmp($href, '//', 2) === 0) {
        $href = (string) substr($href, 2);
    } elseif (strncmp($href, '#', 1) === 0) {
        // Same-page anchor: the link target is the page itself.
        $href = $url;
    } elseif (strncmp($href, '/', 1) === 0) {
        $href = $base_url . $href;
    } else {
        $href = $base_url . '/' . $href;
    }

    // Anything still lacking a scheme (and not a bracketed entry) defaults to http.
    $has_scheme = strncmp($href, 'http://', 7) === 0 || strncmp($href, 'https://', 8) === 0;
    if (!$has_scheme && strncmp($href, '[', 1) !== 0) {
        $href = 'http://' . $href;
    }

    return $href;
}

/**
 * Parses one HTML page and appends every unique URL it references to the
 * global array $c.
 *
 * URLs are collected in this order: <link href>, <a href>, <img src>,
 * <script src>. Elements that lack the attribute (inline scripts, named
 * anchors, images without a source) are skipped instead of contributing an
 * empty or made-up entry. Only the given page is parsed; linked pages are
 * not followed.
 *
 * The libxml "internal errors" flag is switched on only while the document
 * is loaded and is restored afterwards, so callers' libxml error handling is
 * left as it was.
 *
 * @param string $url URL (or local path) of the HTML document to parse.
 * @return void Results are written to the global $c.
 */
function get_links($url)
{
    global $c;

    // Tolerate a caller that never initialised the accumulator.
    if (!is_array($c)) {
        $c = array();
    }

    $doc = new DOMDocument();
    // Real-world HTML is rarely valid; collect parser errors silently, then
    // discard them and restore the previous setting.
    $previous = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTMLFile($url);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if ($loaded === false) {
        // Nothing could be parsed, so there is nothing to collect.
        return;
    }

    $base_url = parse_url($url, PHP_URL_HOST);

    // Tag name => attribute that carries the URL, in collection order.
    $sources = array(
        'link'   => 'href',
        'a'      => 'href',
        'img'    => 'src',
        'script' => 'src',
    );

    foreach ($sources as $tag => $attribute) {
        foreach ($doc->getElementsByTagName($tag) as $element) {
            $raw = $element->getAttribute($attribute);
            if ($raw === '') {
                // Attribute missing or empty: no URL to record.
                continue;
            }

            $href = ($tag === 'a')
                ? _get_links_normalize_anchor($raw, $url, $base_url)
                : _get_links_normalize_resource($raw);

            // Strict comparison keeps numeric-looking strings such as "10"
            // and "1e1" from being treated as duplicates.
            if (!in_array($href, $c, true)) {
                $c[] = $href;
            }
        }
    }
}
// Parse the start page once; get_links() fills the global $c with the URLs it finds.
get_links($url);
// Print the collected URLs for inspection.
var_dump($c);
?>