Question

我有一个脚本，通过解析页面中的 <link> 标记来查找 favicon。但是下载页面的方法有很多种（wget、file_get_contents、curl 等等），页面引入 favicon 的方式也有很多种，因此脚本变得越来越庞大。

有简洁的方法吗？也许是可以使用的API？

以下是不断增长的脚本：

<?php

// Debug entry point: request this file directly with ?domain=test.com.
// Only a non-empty string `domain` parameter triggers a lookup, so unrelated
// query strings and array-valued input (?domain[]=x) are ignored instead of
// reaching the finder with a missing or non-string domain.
if (isset($_GET['domain']) && is_string($_GET['domain']) && $_GET['domain'] !== '')
{
    $obj = new FaviconFinder();
    $obj->invokeDebug($_GET);
}

/**
 * Finds, downloads and stores the favicon advertised by a domain's home page.
 *
 * Flow (see invoke()): fetch the home page (curl first, file_get_contents as
 * a fallback), look for a <link rel="icon"> / <link rel="shortcut icon"> tag,
 * download the referenced file and write it below $path_local1.
 *
 * Two short codes record how the result was obtained and are returned to the
 * caller concatenated in $pipe['method']:
 *   - file_code:    which URL variant / transport delivered the page ('0' = none)
 *   - favicon_code: how the icon URL was derived ('a'-'d', 'g' = no link tag,
 *                   'z' = never attempted)
 */
class FaviconFinder
{
    // local file name reported when no favicon could be found or stored
    const GENERIC_IMAGE = 'image_generic.png';

    // domain before and after redirects
    private $domain;
    private $real_domain;

    // scheme://host[:port] of the page that was actually fetched; used to
    // resolve relative icon paths against the right origin
    private $real_base;

    // the file and how it was obtained
    private $file_code = '0';
    private $file_page;

    // the favicon and how it was obtained
    private $favicon_code = 'z';
    private $file_favicon;
    private $ext;

    // paths local to server and on the internet (URL)
    private $path_local1 = "../../favicons/";
    private $path_local;
    private $path_internet;

    // only these extensions are ever used for stored files; the extension comes
    // from a remote page, so anything else (e.g. "php") must not reach the disk
    private $allowed_ext = array('ico', 'png', 'gif', 'jpg', 'jpeg', 'svg', 'webp', 'bmp');

    // responses shorter than this are treated as bogus placeholder files
    private $min_favicon_bytes = 20;

    // curl time limits in seconds, so one dead host cannot hang the whole probe
    private $connect_timeout = 5;
    private $timeout = 15;

/****************************************************************************************************
invokeTest
****************************************************************************************************/

    /**
     * Debug helper: downloads the domain with wget and prints wget's output.
     *
     * @param array $pipe expects a string under the 'domain' key
     */
    public function invokeTest($pipe)
    {
        $domain = (isset($pipe['domain']) && is_string($pipe['domain'])) ? $pipe['domain'] : '';
        $output = array();

        // escapeshellarg() + "--" keep the untrusted domain from being read as
        // shell syntax or as a wget option
        exec('wget -O ../sites/temp.html -- ' . escapeshellarg($domain) . ' 2>&1', $output);
        print_r($output);
    }

/****************************************************************************************************
invokeDebug
****************************************************************************************************/

    /**
     * Runs invoke() and prints the internal state as HTML for debugging.
     * Every value is HTML-escaped because it originates from the request or
     * from a remote server.
     *
     * @param array $pipe expects a string under the 'domain' key
     */
    public function invokeDebug($pipe)
    {
        $domain = (isset($pipe['domain']) && is_string($pipe['domain'])) ? $pipe['domain'] : '';

        echo "<br><br> domain: " . $this->esc($domain) . "";
        $pipe = $this->invoke($pipe);
        echo "<br><br> real_domain: " . $this->esc($this->real_domain) . "";
        echo "<br><br> file_code | " . $this->esc($this->file_code);
        echo "<br><br> favicon_code | " . $this->esc($this->favicon_code);
        echo "<br><br> favicon_path | " . $this->esc($this->path_internet);
        echo "<br><br> favicon_file | " . $this->esc($this->file_favicon);
        echo "<br><br> favicon_file type | " . gettype($this->file_favicon);
        echo "<br><br> favicon_file length | " . strlen((string) $this->file_favicon);
        echo "<br><br> IMAGE: ";
        if ($this->file_favicon)
        {
            echo "<br><br> path_local | " . $this->esc($this->path_local) . "<br><br>";
            $file64 = base64_encode($this->file_favicon);
            echo "<img src= 'data:image/" . $this->esc($this->ext) . ";base64," . $file64 . "'></img>";
        }
        echo "<br><br>";
    }

/****************************************************************************************************
invoke
****************************************************************************************************/

    /**
     * Looks up the favicon for $pipe['domain'] and stores it locally.
     *
     * @param array $pipe expects a string under the 'domain' key
     * @return array $pipe with three keys added:
     *               'favicon'       icon URL, or the string 'NULL' when none was found
     *               'favicon_local' stored file name, or the generic image name
     *               'method'        file_code followed by favicon_code
     */
    public function invoke( $pipe )
    {
        $this->resetState();

        $domain = (isset($pipe['domain']) && is_string($pipe['domain'])) ? trim($pipe['domain']) : '';
        $this->domain = $domain;

        if ( $domain !== '' && $this->pageFound($domain) && $this->linkFound() && $this->faviconFoundFromLink() )
        {
            $pipe = $this->saveFavicon($pipe);
            $pipe['favicon'] = $this->path_internet;
            $pipe['favicon_local'] = $this->path_local;
        } else {
            $pipe['favicon'] = 'NULL';
            $pipe['favicon_local'] = self::GENERIC_IMAGE;
        }
        $pipe['method'] = $this->file_code . $this->favicon_code;
        return $pipe;
    }

    // clears per-lookup state so a reused instance never reports stale results
    private function resetState()
    {
        $this->domain = null;
        $this->real_domain = null;
        $this->real_base = null;
        $this->file_code = '0';
        $this->file_page = null;
        $this->favicon_code = 'z';
        $this->file_favicon = null;
        $this->ext = null;
        $this->path_local = null;
        $this->path_internet = null;
    }

/****************************************************************************************************
pageFound - uses the facade pattern to find a page and record how it was found
****************************************************************************************************/

    private function pageFound ($domain)
    {
        return $this->pageFoundCurl($domain) || $this->pageFoundGet($domain);
    }

    // wget is another way to get past login page
    // https://stackoverflow.com/questions/1324421/how-to-get-past-the-login-page-with-wget

    // uses curl_exec to retrieve a page; tries each URL variant in order and
    // records the key of the first one that returns a non-empty body
    private function pageFoundCurl ($domain)
    {
        $types = array(
            "curl - 4"=>'https://www.' . $domain,
            "curl - 3"=>'http://www.' . $domain,
            "curl - 6"=>'https://' . $domain,
            "curl - 5"=>'http://' . $domain,

            // returned 302 errors for test.com
            "curl - 1"=>$domain,
            "curl - 2"=>'www.' . $domain
        );

        foreach ($types as $key => $value) {
            $this->file_page = $this->curlExec($value, true);
            if ($this->file_page)
            {
                $this->file_code = $key;
                return true;
            }
        }
        return false;
    }

    // uses file_get_contents to retrieve a page
    // Only explicit http(s) URLs are tried: a scheme-less string handed to
    // file_get_contents() is read as a local file path, not fetched over HTTP.
    private function pageFoundGet( $domain )
    {
        $types = array(
            "file_get - 3"=>'http://www.' . $domain,
            "file_get - 4"=>'https://www.' . $domain,
            "file_get - 5"=>'http://' . $domain,
            "file_get - 6"=>'https://' . $domain
        );

        foreach ($types as $key => $value) {
            $this->file_page = $this->fileGetContents( $value );
            if ($this->file_page)
            {
                $this->file_code = $key;

                // this transport reports no redirect target, so the requested
                // URL is the best known origin for resolving relative icon paths
                $this->rememberOrigin($value);
                return true;
            }
        }
        return false;
    }

/****************************************************************************************************
linkFound
****************************************************************************************************/

    /**
     * Extracts the icon URL from the fetched page and stores it in
     * $path_internet, recording how it was resolved in $favicon_code:
     *   'a' protocol-relative (//host/path), 'b' root-relative (/path),
     *   'c' absolute http(s) URL, 'd' any other relative path,
     *   'g' no matching <link> tag.
     *
     * The conventional /favicon.ico fallback is intentionally not attempted here.
     *
     * @return bool true when a link tag was found
     */
    private function linkFound()
    {
        $regex = '#<link\s+(?=[^>]*rel=(?:\'|")(?:shortcut\s)?icon(?:\'|")\s*)(?:[^>]*href=(?:\'|")(.+?)(?:\'|")).*>#i';
        $link_found = preg_match( $regex , (string) $this->file_page, $matches );

        if ($link_found !== 1)
        {
            $this->path_internet = null;
            $this->favicon_code = 'g';
            return false;
        }

        // the pattern captures at least one character, so $path[0] always exists
        $path = $matches[1];
        $base = $this->baseUrl();

        // handles ( // ) - strncmp is safe for a one-character href such as "/"
        if ( strncmp($path, '//', 2) === 0 )
        {
            $this->favicon_code = 'a';
            $this->path_internet = 'http:' . $path;
        }

        // handles ( / )
        else if ( $path[0] === '/' )
        {
            $this->favicon_code = 'b';
            $this->path_internet = $base . $path;
        }

        // handles ( http:// || https:// ) - requires the full scheme so a
        // relative file name that merely starts with "http" is not misread
        else if ( preg_match('#^https?://#i', $path) === 1 )
        {
            $this->favicon_code = 'c';
            $this->path_internet = $path;
        }

        // any other relative path, resolved against the site root
        else
        {
            $this->favicon_code = 'd';
            $this->path_internet = $base . '/' . $path;
        }
        return true;
    }

    // origin used for relative icon paths: the origin actually fetched when it
    // is known, otherwise the historical guess of http://www.<real_domain>
    private function baseUrl()
    {
        if ($this->real_base)
        {
            return $this->real_base;
        }
        return 'http://www.' . $this->real_domain;
    }

/****************************************************************************************************
faviconFound
****************************************************************************************************/

    // downloads the icon that linkFound() located
    private function faviconFoundFromLink ()
    {
        return $this->faviconFound( $this->path_internet );
    }

    // downloads the icon at the given URL into $file_favicon
    private function faviconFound ($default_location)
    {
        $this->file_favicon = $this->faviconFoundFacade( $default_location );
        return $this->file_favicon ? true : false;
    }

/****************************************************************************************************
More
****************************************************************************************************/

    // single entry point for icon downloads; returns the body or false
    private function faviconFoundFacade($url)
    {
        if (!is_string($url) || $url === '')
        {
            return false;
        }
        return $this->faviconFoundCurl($url) ;
    }

    // alternative download via wget (currently unused);
    // returns true when wget exits with status 0
    private function faviconFoundExec($url)
    {
        $output = array();
        $status = 1;

        // the URL comes from a remote page: quote it and end option parsing
        exec('wget -O ../sites/temp.html -- ' . escapeshellarg((string) $url) . ' 2>&1', $output, $status);
        return $status === 0;
    }

    // alternative download via file_get_contents (currently unused)
    private function faviconFoundGet($url)
    {
        return $this->fileGetContents( $url );
    }

    // responses shorter than $min_favicon_bytes equate to false so bogus files
    // are not saved
    // prisonexp.org does this
    // bestbuy.com does similar
    private function faviconFoundCurl($url)
    {
        $temp = $this->curlExec( $url, false );
        if (!is_string($temp) || strlen($temp) < $this->min_favicon_bytes)
        {
            return false;
        }
        return $temp;
    }

/****************************************************************************************************
saveFavicon
****************************************************************************************************/

    /**
     * Writes the downloaded favicon below $path_local1 and records the file
     * name in $path_local. When there is nothing to write, or the write fails,
     * $path_local is set to the generic image name instead of a missing file.
     *
     * @param array $pipe passed through unchanged
     * @return array
     */
    public function saveFavicon( $pipe )
    {
        // this will remove any query parameters on the favicon link
        // and keep the extension only when it is a known image type
        $this->ext = $this->safeExtension($this->path_internet);

        // create a valid file name from the real domain: for an ordinary host
        // name this only turns dots into underscores
        $name = preg_replace('#[^A-Za-z0-9_\-]#', '_', (string) $this->real_domain);
        if ($name === '' || $name === null)
        {
            $name = 'unknown';
        }

        if ($this->ext !== '') {
            $name = $name . "." . $this->ext;
        }

        if (!is_string($this->file_favicon) || $this->file_favicon === '')
        {
            $this->path_local = self::GENERIC_IMAGE;
            return $pipe;
        }

        // finally save it
        $written = file_put_contents($this->path_local1 . $name, $this->file_favicon);
        $this->path_local = ($written === false) ? self::GENERIC_IMAGE : $name;
        return $pipe;
    }

    // returns the extension of the URL's path when it is an allowed image
    // extension (original letter case kept), otherwise an empty string
    private function safeExtension($url)
    {
        $path = parse_url((string) $url, PHP_URL_PATH);
        if (!is_string($path) || $path === '')
        {
            return '';
        }
        $ext = pathinfo($path, PATHINFO_EXTENSION);
        return in_array(strtolower($ext), $this->allowed_ext, true) ? $ext : '';
    }

/****************************************************************************************************
helper and wrapper functions
****************************************************************************************************/

    // escapes a value for HTML output (debug page only)
    private function esc($value)
    {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }

    /**
     * curl_exec wrapper.
     *
     * @param string $url URL to fetch; redirects are followed
     * @param bool   $set when true, a successful transfer also records the
     *                    final (post-redirect) origin via setRealDomain()
     * @return string|false response body, or false on failure
     */
    private function curlExec ($url, $set)
    {
        $curl = curl_init();
        if ($curl === false)
        {
            return false;
        }
        curl_setopt_array($curl, array(
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 20,

            // the URL is built from untrusted input: never allow file://, ftp://, ...
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,

            CURLOPT_CONNECTTIMEOUT => $this->connect_timeout,
            CURLOPT_TIMEOUT => $this->timeout,
        ));
        $temp = curl_exec($curl);
        if ($set && $temp !== false) $this->setRealDomain($curl);
        curl_close($curl);
        return $temp;
    }

    // records the origin of the URL curl ended up on after redirects
    private function setRealDomain ($curl)
    {
        $url = curl_getinfo( $curl, CURLINFO_EFFECTIVE_URL );
        if (is_string($url) && $url !== '')
        {
            $this->rememberOrigin($url);
        }
    }

    // Derives $real_base (scheme://host[:port]) and $real_domain (host without
    // a leading "www.") from a URL. Returns false, leaving both untouched, when
    // the URL has no host.
    private function rememberOrigin($url)
    {
        // scheme-less input is treated as http so that parse_url() can see the host
        if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url) !== 1)
        {
            $url = 'http://' . $url;
        }

        $parts = parse_url($url);
        if (!is_array($parts) || empty($parts['host']))
        {
            return false;
        }

        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
        $port = isset($parts['port']) ? ':' . $parts['port'] : '';

        $this->real_base = $scheme . '://' . $parts['host'] . $port;
        $this->real_domain = preg_replace('#^www\.(.+\.)#i', '$1', $parts['host']);
        return true;
    }

    // deprecated as curl can do everything I need, just in case though
    // https://stackoverflow.com/questions/
    // 6009284/how-do-i-ignore-a-moved-header-with-file-get-contents-in-php
    private function fileGetContents($value)
    {
        // refuse anything that is not an explicit http(s) URL; otherwise
        // file_get_contents() would open local files or other stream wrappers
        if (!is_string($value) || preg_match('#^https?://#i', $value) !== 1)
        {
            return false;
        }

        $opts = array(
            'http'=>array(
                'follow_location' => true,
                'max_redirects' => 20
            )
        );
        $context = stream_context_create($opts);

        // failures are expected while probing URL variants, hence the suppression
        return @file_get_contents( $value, false, $context );
    }

/****************************************************************************************************
removed
****************************************************************************************************/

    // Unused legacy fallback: tries http://www.<last two labels>/favicon.ico
    // for a domain that has a subdomain part. Defaults to the real domain.
    private function removed ($domain = null)
    {
        if ($domain === null)
        {
            $domain = $this->real_domain;
        }
        if (!is_string($domain))
        {
            return false;
        }

        $res = preg_match('#(.*?)([^\.]*)(\.)([^\.]*)$#', $domain, $matches);
        if ($res !== 1 || !$matches[1])
        {
            return false;
        }

        $main = $matches[2] . $matches[3] . $matches[4];
        $default_location = 'http://www.' . $main . '/favicon.ico';
        $this->file_favicon = $this->fileGetContents( $default_location );
        if( $this->file_favicon )
        {
            $this->path_internet = $default_location;
            $this->favicon_code = 'f';
            return true;
        }
        return false;
    }

}

这是前端的一个API。

To check favicon using Google API

Answer 1

没有针对favicons的策略或API。解析HTML，寻找：

<link rel="shortcut icon" href="...">

或只是：

<link rel="icon" href="...">

并提取href属性的值。

如果不存在这样的标记（或者引用的图标不存在），那么检查 /favicon.ico（这一切正是从 1999 年的 Internet Explorer 5 开始的）。

此外，iOS（以及某些版本的Android）会搜索带有rel="apple-touch-icon"或rel="apple-touch-icon-precomposed"的额外<link>元素。

其他一切都只是猜测和推断。

另请参阅：https://en.wikipedia.org/wiki/Favicon#History

获取域名 favicon 的策略是什么？

1 个答案: