我有一个脚本,通过解析页面中的 link 标签来查找 favicon。但是下载页面的方法有多种(wget、file_get_contents、curl 等等),页面引入 favicon 的方式也有多种,因此脚本变得越来越臃肿。
有简洁的方法吗?也许是可以使用的API?
以下是不断增长的脚本:
<?php
// Debug entry point: issue a direct GET request and pass in the domain ( ?domain=test.com ).
// Only run when a non-empty string "domain" parameter is present; any other query
// string (or ?domain[]=x) would otherwise reach invokeDebug() without a usable domain.
if (isset($_GET['domain']) && is_string($_GET['domain']) && $_GET['domain'] !== '')
{
    $obj = new FaviconFinder();
    $obj->invokeDebug($_GET);
}
/**
 * Locates, downloads and stores the favicon of a web site.
 *
 * The work flow driven by invoke() is:
 *   1. download the site's page (curl first, file_get_contents as fallback),
 *   2. look for a <link rel="icon"> / <link rel="shortcut icon"> tag in it,
 *   3. download the icon that tag points to,
 *   4. write the icon into the local favicon directory.
 *
 * Every step records a short code describing how it succeeded; the two codes
 * are concatenated into $pipe['method'] by invoke().
 */
class FaviconFinder
{
    /** Local image name reported when no favicon could be stored. */
    const GENERIC_IMAGE = 'image_generic.png';
    /** Seconds allowed for establishing a connection. */
    const CONNECT_TIMEOUT = 10;
    /** Seconds allowed for one complete transfer. */
    const TRANSFER_TIMEOUT = 30;
    /** Upper bound on followed redirects (same value the stream context uses). */
    const MAX_REDIRECTS = 20;

    // domain before and after redirects
    private $domain;
    private $real_domain;
    // the file and how it was obtained
    private $file_code = '0';
    private $file_page;
    // the favicon and how it was obtained
    private $favicon_code = 'z';
    private $file_favicon;
    private $ext;
    // paths local to server and on the internet (URL)
    private $path_local1 = "../../favicons/";
    private $path_local;
    private $path_internet;
    // The extension comes from a remote URL, so only known image extensions are
    // ever used in a file name written to disk (a page must not be able to make
    // this script store e.g. "something.php").
    private $allowed_ext = array('ico', 'png', 'gif', 'jpg', 'jpeg', 'svg', 'bmp', 'webp');

    /**
     * Manual experiment: fetches the domain with wget and prints wget's output.
     *
     * @param array $pipe must contain the key 'domain'
     */
    public function invokeTest($pipe)
    {
        if (!isset($pipe['domain']) || !is_string($pipe['domain']))
        {
            return;
        }
        // "--" stops option parsing and escapeshellarg() quotes the value, so the
        // domain can neither inject shell syntax nor be read as a wget option.
        $command = 'wget -O ../sites/temp.html -- ' . escapeshellarg($pipe['domain']) . ' 2>&1';
        exec($command, $output);
        print_r($output);
    }

    /**
     * Runs invoke() and echoes the collected state as HTML for debugging.
     *
     * All dynamic values are HTML-escaped because the domain comes straight from
     * the request and the other values come from remote servers.
     *
     * @param array $pipe must contain the key 'domain'
     */
    public function invokeDebug($pipe)
    {
        $shown = (isset($pipe['domain']) && is_string($pipe['domain'])) ? $pipe['domain'] : '';
        echo "<br><br> domain: " . $this->escapeHtml($shown) . "";
        $pipe = $this->invoke($pipe);
        echo "<br><br> real_domain: " . $this->escapeHtml($this->real_domain) . "";
        echo "<br><br> file_code | " . $this->escapeHtml($this->file_code);
        echo "<br><br> favicon_code | " . $this->escapeHtml($this->favicon_code);
        echo "<br><br> favicon_path | " . $this->escapeHtml($this->path_internet);
        echo "<br><br> favicon_file | " . $this->escapeHtml($this->file_favicon);
        echo "<br><br> favicon_file type | " . gettype($this->file_favicon);
        echo "<br><br> favicon_file length | " . strlen((string) $this->file_favicon);
        echo "<br><br> IMAGE: ";
        if ($this->file_favicon)
        {
            echo "<br><br> path_local | " . $this->escapeHtml($this->path_local) . "<br><br>";
            $file64 = base64_encode($this->file_favicon);
            echo "<img src= 'data:image/" . $this->escapeHtml($this->ext) . ";base64," . $file64 . "'></img>";
        }
        echo "<br><br>";
    }

    /**
     * Finds and stores the favicon for $pipe['domain'].
     *
     * @param array $pipe input record; the key 'domain' is read
     * @return array the same record with 'favicon', 'favicon_local' and 'method' set.
     *               On failure 'favicon' is the string 'NULL' and 'favicon_local'
     *               is the generic image name.
     */
    public function invoke( $pipe )
    {
        // An instance may be reused; never let results of a previous run leak in.
        $this->resetState();

        $domain = (isset($pipe['domain']) && is_string($pipe['domain'])) ? trim($pipe['domain']) : '';
        $this->domain = $domain;

        if ( $domain !== '' && $this->pageFound($domain) && $this->linkFound() && $this->faviconFoundFromLink() )
        {
            $pipe = $this->saveFavicon($pipe);
            $pipe['favicon'] = $this->path_internet;
            // path_local stays null when the icon could not be written to disk;
            // do not advertise a local file that does not exist.
            $pipe['favicon_local'] = ($this->path_local !== null) ? $this->path_local : self::GENERIC_IMAGE;
        } else {
            $pipe['favicon'] = 'NULL';
            $pipe['favicon_local'] = self::GENERIC_IMAGE;
        }
        $pipe['method'] = $this->file_code . $this->favicon_code;
        return $pipe;
    }

    /**
     * Restores every per-run property to its initial value.
     */
    private function resetState()
    {
        $this->domain = null;
        $this->real_domain = null;
        $this->file_code = '0';
        $this->file_page = null;
        $this->favicon_code = 'z';
        $this->file_favicon = null;
        $this->ext = null;
        $this->path_local = null;
        $this->path_internet = null;
    }

    /**
     * Downloads the site's page, trying curl first and file_get_contents second.
     * The successful strategy is recorded in $file_code.
     *
     * @param string $domain
     * @return bool true when a non-empty page was stored in $file_page
     */
    private function pageFound ($domain)
    {
        return $this->pageFoundCurl($domain) || $this->pageFoundGet($domain);
    }

    // wget is another way to get past login page
    // https://stackoverflow.com/questions/1324421/how-to-get-past-the-login-page-with-wget

    /**
     * Tries the URL variants of $domain with curl, in the listed order.
     *
     * @param string $domain
     * @return bool true on the first variant that returns a non-empty body
     */
    private function pageFoundCurl ($domain)
    {
        $types = array(
            "curl - 4" => 'https://www.' . $domain,
            "curl - 3" => 'http://www.' . $domain,
            "curl - 6" => 'https://' . $domain,
            "curl - 5" => 'http://' . $domain,
            // returned 302 errors for test.com
            "curl - 1" => $domain,
            "curl - 2" => 'www.' . $domain,
        );
        foreach ($types as $key => $value)
        {
            $this->file_page = $this->curlExec($value, true);
            if ($this->file_page)
            {
                $this->file_code = $key;
                return true;
            }
        }
        return false;
    }

    /**
     * Tries the URL variants of $domain with file_get_contents, in the listed order.
     *
     * Scheme-less variants are not attempted: file_get_contents() treats a value
     * without a scheme as a local file path, not as a web address.
     *
     * @param string $domain
     * @return bool true on the first variant that returns a non-empty body
     */
    private function pageFoundGet( $domain )
    {
        $types = array(
            "file_get - 3" => 'http://www.' . $domain,
            "file_get - 4" => 'https://www.' . $domain,
            "file_get - 5" => 'http://' . $domain,
            "file_get - 6" => 'https://' . $domain,
        );
        foreach ($types as $key => $value)
        {
            $this->file_page = $this->fileGetContents( $value );
            if ($this->file_page)
            {
                $this->file_code = $key;
                // The final URL after redirects is not known on this path, so the
                // host that was requested is the best available "real" domain.
                $host = parse_url($value, PHP_URL_HOST);
                if (is_string($host) && $host !== '')
                {
                    $this->real_domain = $this->stripWww($host);
                }
                return true;
            }
        }
        return false;
    }

    /**
     * Searches the downloaded page for an icon <link> tag and turns its href
     * into an absolute URL stored in $path_internet.
     *
     * favicon_code values set here:
     *   a - protocol-relative href   ( //host/path )
     *   b - root-relative href       ( /path )
     *   c - absolute href            ( http://... or https://... )
     *   d - document-relative href   ( path ), resolved against the site root
     *   g - no icon link found; the conventional /favicon.ico location is not tried
     *
     * @return bool true when an icon link was found
     */
    private function linkFound()
    {
        $domain = $this->real_domain;
        $regex = '#<link\s+(?=[^>]*rel=(?:\'|")(?:shortcut\s)?icon(?:\'|")\s*)(?:[^>]*href=(?:\'|")(.+?)(?:\'|")).*>#i';
        if (preg_match( $regex, (string) $this->file_page, $matches ) !== 1)
        {
            $this->path_internet = null;
            $this->favicon_code = 'g';
            return false;
        }

        // Attribute values are HTML-encoded in the page ( &amp; -> & ).
        $path = html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');

        // substr() is used instead of $path[1] so a one-character href such as "/"
        // does not read past the end of the string.
        if (substr($path, 0, 2) === '//')
        {
            $this->favicon_code = 'a';
            $this->path_internet = 'http:' . $path;
        }
        else if (substr($path, 0, 1) === '/')
        {
            $this->favicon_code = 'b';
            $this->path_internet = 'http://www.' . $domain . $path;
        }
        // Require the full scheme so a relative name like "httpicon.png" is not
        // mistaken for an absolute URL.
        else if (preg_match('#^https?://#i', $path) === 1)
        {
            $this->favicon_code = 'c';
            $this->path_internet = $path;
        }
        else
        {
            $this->favicon_code = 'd';
            $this->path_internet = 'http://www.' . $domain . '/' . $path;
        }
        return true;
    }

    /**
     * Downloads the icon referenced by the page's <link> tag.
     *
     * @return bool true when icon data was stored in $file_favicon
     */
    private function faviconFoundFromLink ()
    {
        return $this->faviconFound( $this->path_internet );
    }

    /**
     * Downloads the icon at the given URL into $file_favicon.
     *
     * @param string $default_location URL of the icon
     * @return bool true when icon data was stored in $file_favicon
     */
    private function faviconFound ($default_location)
    {
        $this->file_favicon = $this->faviconFoundFacade( $default_location );
        return $this->file_favicon ? true : false;
    }

    /**
     * Single place that decides which download strategy is used for icons.
     *
     * @param string $url
     * @return string|false icon data, or false on failure
     */
    private function faviconFoundFacade($url)
    {
        return $this->faviconFoundCurl($url);
    }

    /**
     * Alternative strategy: fetch the URL with wget into ../sites/temp.html.
     * Returns nothing; wget's output is collected but not used.
     *
     * @param string $url
     */
    private function faviconFoundExec($url)
    {
        // See invokeTest() for why "--" and escapeshellarg() are required.
        exec('wget -O ../sites/temp.html -- ' . escapeshellarg((string) $url) . ' 2>&1', $output);
    }

    /**
     * Alternative strategy: fetch the URL through the file_get_contents wrapper.
     *
     * @param string $url
     * @return string|false
     */
    private function faviconFoundGet($url)
    {
        return $this->fileGetContents( $url );
    }

    /**
     * Fetches an icon with curl.
     *
     * Responses shorter than 20 bytes are treated as failures so bogus files are
     * not saved (prisonexp.org does this, bestbuy.com does similar).
     *
     * @param string $url
     * @return string|false icon data, or false on failure / too-short response
     */
    private function faviconFoundCurl($url)
    {
        $temp = $this->curlExec( $url, false );
        if ($temp === false || strlen($temp) < 20)
        {
            return false;
        }
        return $temp;
    }

    /**
     * Writes the downloaded icon into the local favicon directory.
     *
     * The file name is the real domain with dots replaced by underscores, plus the
     * icon's extension when it is a known image extension. Query parameters on the
     * icon URL are ignored. On success $path_local holds the file name; when the
     * write fails it stays null.
     *
     * @param array $pipe passed through unchanged
     * @return array $pipe
     */
    public function saveFavicon( $pipe )
    {
        // parse_url() yields null/false when the URL has no path or is malformed.
        $url_path = parse_url((string) $this->path_internet, PHP_URL_PATH);
        $ext = is_string($url_path) ? pathinfo($url_path, PATHINFO_EXTENSION) : '';
        $this->ext = in_array(strtolower($ext), $this->allowed_ext, true) ? $ext : '';

        $name = str_replace('.', '_', (string) $this->real_domain);
        if ($this->ext !== '')
        {
            $name = $name . "." . $this->ext;
        }

        $written = file_put_contents($this->path_local1 . $name, $this->file_favicon);
        $this->path_local = ($written === false) ? null : $name;
        return $pipe;
    }

    /**
     * curl_exec wrapper.
     *
     * @param string $url URL to fetch; redirects are followed
     * @param bool   $set when true, the host of the final URL is stored as the real domain
     * @return string|false response body, or false on failure
     */
    private function curlExec ($url, $set)
    {
        // The URL is built from caller-supplied input; restrict curl to web
        // protocols so values such as file:///... cannot read local files.
        $web_only = CURLPROTO_HTTP | CURLPROTO_HTTPS;
        $curl = curl_init();
        curl_setopt_array($curl, array(
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => self::MAX_REDIRECTS,
            CURLOPT_PROTOCOLS => $web_only,
            CURLOPT_REDIR_PROTOCOLS => $web_only,
            // Without timeouts one unresponsive host blocks the whole request.
            CURLOPT_CONNECTTIMEOUT => self::CONNECT_TIMEOUT,
            CURLOPT_TIMEOUT => self::TRANSFER_TIMEOUT,
        ));
        $temp = curl_exec($curl);
        // Only a completed transfer says anything about where the site really lives.
        if ($set && $temp !== false)
        {
            $this->setRealDomain($curl);
        }
        curl_close($curl);
        return $temp;
    }

    /**
     * Stores the host of the handle's effective (post-redirect) URL, without a
     * leading "www.", as the real domain. Leaves the value untouched when the
     * effective URL has no host.
     *
     * @param resource|object $curl curl handle after curl_exec()
     */
    private function setRealDomain ($curl)
    {
        $host = parse_url((string) curl_getinfo($curl, CURLINFO_EFFECTIVE_URL), PHP_URL_HOST);
        if (is_string($host) && $host !== '')
        {
            $this->real_domain = $this->stripWww($host);
        }
    }

    /**
     * Removes a leading "www." from a host name that has further labels after it.
     *
     * @param string $host
     * @return string
     */
    private function stripWww($host)
    {
        return preg_replace('#^www\.(.+\.)#i', '$1', $host);
    }

    /**
     * Escapes a value for output inside HTML text or a quoted attribute.
     * Invalid byte sequences (e.g. binary icon data) are substituted rather than
     * causing an empty result.
     *
     * @param mixed $value
     * @return string
     */
    private function escapeHtml($value)
    {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }

    /**
     * file_get_contents wrapper that follows redirects.
     *
     * deprecated as curl can do everything I need, just in case though
     * https://stackoverflow.com/questions/
     * 6009284/how-do-i-ignore-a-moved-header-with-file-get-contents-in-php
     *
     * @param string $value http:// or https:// URL
     * @return string|false response body; false on failure or when $value is not
     *                      an http(s) URL
     */
    private function fileGetContents($value)
    {
        // Anything without an explicit web scheme would be opened as a local
        // file (or another stream wrapper) by file_get_contents().
        if (!is_string($value) || preg_match('#^https?://#i', $value) !== 1)
        {
            return false;
        }
        $opts = array(
            'http' => array(
                'follow_location' => true,
                'max_redirects' => self::MAX_REDIRECTS,
                'timeout' => self::TRANSFER_TIMEOUT,
            ),
        );
        $context = stream_context_create($opts);
        return @file_get_contents( $value, false, $context );
    }

    /**
     * Currently unused: for a host with a subdomain, tries /favicon.ico on the
     * main domain (the last two labels of the host).
     *
     * @param string|null $domain host to inspect; defaults to the real domain
     * @return bool true when an icon was found (favicon_code 'f'), false otherwise
     */
    private function removed ($domain = null)
    {
        if ($domain === null)
        {
            $domain = $this->real_domain;
        }
        $matched = preg_match('#(.*?)([^\.]*)(\.)([^\.]*)$#', (string) $domain, $matches);
        // $matches[1] is whatever precedes the last two labels, i.e. the subdomain part.
        if ($matched !== 1 || $matches[1] === '')
        {
            return false;
        }
        $main = $matches[2] . $matches[3] . $matches[4];
        $default_location = 'http://www.' . $main . '/favicon.ico';
        $this->file_favicon = $this->faviconFoundGet( $default_location );
        if ( $this->file_favicon )
        {
            $this->path_internet = $default_location;
            $this->favicon_code = 'f';
            return true;
        }
        return false;
    }
}
这是前端的一个API。
答案 0 :(得分:0)
没有针对favicons的策略或API。解析HTML,寻找:
<link rel="shortcut icon" href="...">
或只是:
<link rel="icon" href="...">
并提取href
属性的值。
如果不存在这样的标记(或者引用的图标不存在),那么再检查 /favicon.ico
(这一切都始于1999年的 Internet Explorer 5)。
此外,iOS(以及某些版本的Android)还会搜索带有
rel="apple-touch-icon"
或 rel="apple-touch-icon-precomposed"
的额外 <link> 元素。
其他一切都只是猜测和臆断。