我想抓取此链接http://dl2.my98music.com/Data/
最多4-5个级别并打印其中的所有链接但我无法进入第二级,因为链接http://dl2.my98music.com/Data/
以某种方式更改为http://dl2.my98music.com/
删除自动/
之后的部分。我如何阻止这种情况发生?
class crawler
{
protected $_url;
protected $_depth;
protected $_host;
protected $_useHttpAuth = false;
protected $_user;
protected $_pass;
protected $_seen = array();
protected $_filter = array();
public function __construct($url, $depth = 5)
{
$this->_url = $url;
$this->_depth = $depth;
$parse = parse_url($url);
$this->_host = $parse['host'];
}
protected function _processAnchors($content, $url, $depth)
{
$dom = new DOMDocument('1.0');
@$dom->loadHTML($content);
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $element) {
$href = $element->getAttribute('href');
if (0 !== strpos($href, 'http')) {
$path = '/' . ltrim($href, '/');
if (extension_loaded('http')) {
$href = http_build_url($url, array('path' => $path));
} else {
$parts = parse_url($url);
$href = $parts['scheme'] . '://';
if (isset($parts['user']) && isset($parts['pass'])) {
$href .= $parts['user'] . ':' . $parts['pass'] . '@';
}
$href .= $parts['host'];
if (isset($parts['port'])) {
$href .= ':' . $parts['port'];
}
$href .= $path;
}
}
// Crawl only link that belongs to the start domain
$this->crawl_page($href, $depth - 1);
}
}
protected function _getContent($url)
{
$handle = curl_init($url);
if ($this->_useHttpAuth) {
curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
curl_setopt($handle, CURLOPT_USERPWD, $this->_user . ":" . $this->_pass);
}
// follows 302 redirect, creates problem wiht authentication
// curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);
// return the content
curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);
/* Get the HTML or whatever is linked in $url. */
$response = curl_exec($handle);
// response total time
$time = curl_getinfo($handle, CURLINFO_TOTAL_TIME);
/* Check for 404 (file not found). */
$httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
curl_close($handle);
return array($response, $httpCode, $time);
}
protected function _printResult($url, $depth, $httpcode, $time)
{
ob_end_flush();
$currentDepth = $this->_depth - $depth;
$count = count($this->_seen);
echo "N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$url <br>";
ob_start();
flush();
}
protected function isValid($url, $depth)
{
if (strpos($url, $this->_host) === false
|| $depth === 0
|| isset($this->_seen[$url])
) {
return false;
}
foreach ($this->_filter as $excludePath) {
if (strpos($url, $excludePath) !== false) {
return false;
}
}
return true;
}
public function crawl_page($url, $depth)
{
if (!$this->isValid($url, $depth)) {
return;
}
// add to the seen URL
$this->_seen[$url] = true;
// get Content and Return Code
list($content, $httpcode, $time) = $this->_getContent($url);
// print Result for current Page
$this->_printResult($url, $depth, $httpcode, $time);
// process subPages
$this->_processAnchors($content, $url, $depth);
}
public function setHttpAuth($user, $pass)
{
$this->_useHttpAuth = true;
$this->_user = $user;
$this->_pass = $pass;
}
public function addFilterPath($path)
{
$this->_filter[] = $path;
}
public function run()
{
$this->crawl_page($this->_url, $this->_depth);
}
}
$startURL = 'http://dl2.my98music.com/Data/';
$depth = 6;
$username = '';
$password = '';
$crawler = new crawler($startURL, $depth);
$crawler->setHttpAuth($username, $password);
// Exclude path with the following structure to be processed
$crawler->addFilterPath('customer/account/login/referer');
$crawler->run();
答案 0 :(得分:1)
错误出现在_processAnchors()
,您需要$href
中的完整路径。
在我改变之后它起作用了:
$path = '/' . ltrim($href, '/');
要:
$path = '/Data/' . ltrim($href, '/');