这是我的代码示例。它只能扫描一个网页并打印此页面上的所有链接。
我需要递归扫描整个网站,并打印本网站所有页面的所有链接。
以下是我的类的示例代码:
<?php
/**
 * Recursively crawls a web site and prints every link that starts with the
 * root link.
 *
 * The root link is both the first page fetched and the prefix a link must
 * have in order to be printed and followed. Printing stops after
 * $iCountOfPages links have been printed in one run.
 */
class ParseLinks
{
    /** @var string Start page; also the prefix that printed links must begin with. */
    private $sRootLink;

    /** @var int Maximum number of links printed by one getAllLinks() run. */
    private $iCountOfPages;

    /** @var int Number of links printed so far in the current run. */
    private $iCounter = 0;

    /** @var array Map of page URL => true for every page already fetched in the current run. */
    private $cache = array();

    /**
     * @param string $sRootLink     Page to start from and prefix to filter links by.
     * @param int    $iCountOfPages Maximum number of links to print.
     */
    public function __construct($sRootLink, $iCountOfPages)
    {
        $this->sRootLink = $sRootLink;
        $this->iCountOfPages = $iCountOfPages;
    }

    /** @return string The configured root link. */
    public function getRootLink()
    {
        return $this->sRootLink;
    }

    /** @return int The configured maximum number of printed links. */
    public function getCountOfPages()
    {
        return $this->iCountOfPages;
    }

    /** @param string $sRootLink New start page / link prefix. */
    public function setRootLink($sRootLink)
    {
        $this->sRootLink = $sRootLink;
    }

    /** @param int $iCountOfPages New maximum number of printed links. */
    public function setCountOfPages($iCountOfPages)
    {
        $this->iCountOfPages = $iCountOfPages;
    }

    /**
     * Crawls the site starting at the root link and echoes each matching
     * link followed by "<br>".
     *
     * @return void
     */
    public function getAllLinks()
    {
        // Start every run from a clean state so the method can be called repeatedly.
        $this->iCounter = 0;
        $this->cache = array();
        $this->rec($this->sRootLink);
    }

    /**
     * Fetches one page, prints its matching links and follows each of them
     * that has not been fetched yet.
     *
     * @param string $link Address of the page to fetch.
     * @return void
     */
    private function rec($link)
    {
        // Mark the page before fetching so that cyclic links cannot recurse forever.
        $this->cache[$link] = true;

        // A page that cannot be fetched is skipped; the warning is suppressed
        // because the failure is handled explicitly right here.
        $html = @file_get_contents($link);
        if ($html === false || trim($html) === '') {
            // DOMDocument::loadHTML() rejects an empty document.
            return;
        }

        // Real-world markup is rarely valid; collect parser complaints
        // internally instead of emitting them as PHP warnings.
        $previous = libxml_use_internal_errors(true);
        $DOM = new DOMDocument;
        $DOM->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        foreach ($DOM->getElementsByTagName('a') as $element) {
            // Re-checked on every iteration because a nested call may have used up the budget.
            if ($this->iCounter >= $this->iCountOfPages) {
                break;
            }

            $href = $element->getAttribute("href");
            if (!$this->startsWith($href, $this->sRootLink)) {
                continue;
            }

            // The href comes from a remote page, so escape it for HTML output.
            echo htmlspecialchars($href, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<br>";
            $this->iCounter++;

            if (!isset($this->cache[$href])) {
                $this->rec($href);
            }
        }
    }

    /**
     * Tells whether $haystack begins with $needle.
     *
     * @param string $haystack String to inspect.
     * @param string $needle   Expected prefix; an empty prefix always matches.
     * @return bool
     */
    private function startsWith($haystack, $needle)
    {
        return $needle === "" || strncmp($haystack, $needle, strlen($needle)) === 0;
    }
}
答案 0 :(得分:0)
如果有人需要,这是我的版本,运行正常。以下是我的类的示例:创建对象时,传入站点地址和要显示的链接数量。
<?php
// Diagnostic settings applied through ini_set(): report everything and
// display errors, including those raised during PHP start-up.
$aDisplaySettings = array(
    'error_reporting' => E_ALL,
    'display_errors' => 1,
    'display_startup_errors' => 1,
);
foreach ($aDisplaySettings as $sOption => $mValue) {
    ini_set($sOption, $mValue);
}
unset($aDisplaySettings, $sOption, $mValue);

// This call replaces the E_ALL level set above: from here on only fatal
// run-time errors (E_ERROR) are reported.
error_reporting(E_ERROR);

// Allow the script to run for up to 15000 seconds.
set_time_limit(15000);
/**
 * Crawls a web site starting from a root link, collects the unique links that
 * point to the same host, prints them as HTML and saves them to
 * "allLinksFromYourSite.csv" in the current working directory.
 */
class ParseLinks
{
    /**
     * Deepest page level that is still fetched; the start page is level 1.
     * Links found on a page at this level are recorded but not followed.
     */
    const MAX_DEPTH = 4;

    /** @var string Address of the page the crawl starts from. */
    private $sRootLink;

    /** @var int Maximum number of links to collect. */
    private $iCountOfPages;

    /** @var array Collected links, in discovery order. */
    private $linkArray = array();

    /** @var array Map of link => true mirroring $linkArray, for constant-time duplicate checks. */
    private $seenLinks = array();

    /** @var int Current recursion level (0 while no page is being processed). */
    private $iDeep;

    /** @var string Host of the root link; only links to this host are collected. */
    private $sDomain;

    /** @var string Scheme of the root link, used to complete root-relative links. */
    private $sScheme;

    /**
     * @param string $sRootLink     Absolute URL of the start page.
     * @param int    $iCountOfPages Maximum number of links to collect.
     */
    public function __construct($sRootLink, $iCountOfPages)
    {
        $this->sRootLink = $sRootLink;
        $this->iCountOfPages = $iCountOfPages;
        $this->iDeep = 0;
        $this->sDomain = "";
        $this->sScheme = "";
    }

    /**
     * Runs the crawl, then echoes the result and writes the CSV file.
     *
     * @return void
     */
    public function getAllLinks()
    {
        // Reset the crawl state so the method can be called more than once.
        $this->linkArray = array();
        $this->seenLinks = array();
        $this->iDeep = 0;
        $this->sDomain = "";
        $this->sScheme = "";

        $this->recParseLinks($this->sRootLink);
        $this->printLinks();
        $this->saveToCSV();
    }

    /**
     * Echoes a short header followed by one HTML anchor per collected link.
     *
     * @return void
     */
    private function printLinks()
    {
        // NOTE(review): the "www." prefix is printed even when the host already
        // starts with it, and the CSV header omits it - confirm which is intended.
        echo "Web-site: www." . $this->escapeHtml($this->sDomain) . "</br>Count of links: " . count($this->linkArray) . "</br></br>";
        foreach ($this->linkArray as $element) {
            // Links come from remote pages; escape them before placing them in markup.
            $safe = $this->escapeHtml($element);
            echo "<a href=\"" . $safe . "\">" . $safe . "</a>" . "<br>";
        }
    }

    /**
     * Writes the header and the collected links, one per line, to
     * "allLinksFromYourSite.csv". Raises an E_USER_WARNING and writes nothing
     * when the file cannot be opened.
     *
     * @return void
     */
    private function saveToCSV()
    {
        $fp = @fopen("allLinksFromYourSite.csv", "w");
        if ($fp === false) {
            trigger_error("ParseLinks: cannot open allLinksFromYourSite.csv for writing", E_USER_WARNING);
            return;
        }
        fwrite($fp, "Web-site: $this->sDomain" . PHP_EOL);
        fwrite($fp, "Count of links: " . count($this->linkArray) . PHP_EOL . PHP_EOL);
        foreach ($this->linkArray as $element) {
            fwrite($fp, $element . PHP_EOL);
        }
        fclose($fp);
    }

    /**
     * Fetches one page, records its same-host links and follows each new link
     * while the depth limit allows it.
     *
     * @param string $link Absolute URL of the page to process.
     * @return void
     */
    private function recParseLinks($link)
    {
        if (strlen($link) <= 1) {
            return;
        }

        if ($this->iDeep === 0) {
            // The first call determines which host and scheme the crawl is bound to.
            $d = parse_url($link);
            if ($d === false || !isset($d['host'], $d['scheme'])) {
                return;
            }
            $this->sDomain = $d['host'];
            $this->sScheme = $d['scheme'];
        }

        // Fetch before touching the depth counter so every exit path below
        // leaves it balanced. The failure is handled explicitly, hence "@".
        $html = @file_get_contents($link);
        if ($html === false || trim($html) === '') {
            // Unreachable or empty page: DOMDocument::loadHTML() rejects empty input.
            return;
        }

        // Collect markup errors internally instead of emitting PHP warnings.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $this->iDeep++;
        foreach ($doc->getElementsByTagName('a') as $element) {
            // Checked on every iteration: a nested call may have filled the quota.
            if (count($this->linkArray) >= $this->iCountOfPages) {
                break;
            }

            $href = trim($element->getAttribute('href'));
            if ($href === '') {
                continue;
            }

            if (strncmp($href, '//', 2) === 0) {
                // Protocol-relative link: it names its own host, only the scheme is missing.
                $href = $this->sScheme . ":" . $href;
            } elseif ($href[0] === '/' || $href[0] === '?') {
                // Root-relative path or bare query string: prepend scheme and host.
                $href = $this->sScheme . "://" . $this->sDomain . $href;
            }

            $parts = parse_url($href);
            if ($parts === false || !isset($parts['host'])) {
                // Malformed, relative, fragment-only or host-less (mailto:, javascript:) link.
                continue;
            }
            if (strcasecmp($parts['host'], $this->sDomain) !== 0) {
                // Host names are case-insensitive; anything else is an external link.
                continue;
            }

            if (!$this->linkExists($href) && strlen($href) > 1) {
                $this->linkArray[] = $href;
                $this->seenLinks[$href] = true;
                if ($this->iDeep < self::MAX_DEPTH) {
                    $this->recParseLinks($href);
                }
            }
        }
        $this->iDeep--;
    }

    /**
     * Tells whether a link has already been collected.
     *
     * @param string $link Link to look up.
     * @return bool
     */
    private function linkExists($link)
    {
        return isset($this->seenLinks[$link]);
    }

    /**
     * Escapes a value for use in HTML text or a double-quoted attribute.
     *
     * @param string $value Raw value.
     * @return string
     */
    private function escapeHtml($value)
    {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}
// Example run: crawl https://yoursite.com/ and collect at most 3000 links.
// Replace the placeholder address with the site to scan.
$parseLinksObject = new ParseLinks('https://yoursite.com/', 3000);
// Performs the crawl, echoes the collected links and saves them to a file.
$parseLinksObject->getAllLinks();