网站管理员工具 API 抓取优化

时间:2015-01-29 17:06:53

标签: google-webmaster-tools

我正在使用网站管理员工具 API 抓取并解析来自 Google 的数据,但下载速度很慢。有没有办法优化,或者改用其他方法?根据数据量的不同,整个过程可能需要一个多小时。

问候。

/** Scheme and host prepended to every request URL built in this class. */
const HOST = "https://www.google.com";
/** Path prefix appended to HOST when requesting Webmaster Tools feeds. */
const SERVICEURI = "/webmasters/tools/";

    /**
     * Initialise the instance in its logged-out state: no auth token, no
     * selected domain and an empty result set.
     */
    public function __construct()
    {
        $this->_data     = array();
        $this->_domain   = false;
        $this->_loggedIn = false;
        $this->_auth     = false;
    }

    /**
     * Collect the crawl issues of a site and return them as an array of rows.
     *
     * @param string $domain Site URL; it has to pass _validateDomain().
     *
     * @return array The rows gathered by _prepareData(); the first row holds
     *               the column headline.
     *
     * @throws Exception When the domain fails validation or the crawl issue
     *                   data cannot be prepared.
     */
    public function getArray($domain)
    {
        // Guard clauses replace the nested if/else; the former `exit;`
        // statements sat behind a `throw` and could never execute.
        if (!$this->_validateDomain($domain)) {
            throw new Exception('The given domain is not connected to your Webmastertools account!');
        }

        if (!$this->_prepareData()) {
            throw new Exception('Error receiving crawl issues for ' . $domain);
        }

        return $this->_data;
    }

    /**
     * Collect the crawl issues of a site and emit them as CSV.
     *
     * Without $localPath the CSV is sent to the output stream preceded by
     * download headers; with $localPath it is written to a file inside that
     * directory.
     *
     * @param string       $domain    Site URL; it has to pass _validateDomain().
     * @param string|false $localPath Target directory, or false to stream the
     *                                CSV to the client.
     *
     * @return void
     *
     * @throws Exception When the domain fails validation or the crawl issue
     *                   data cannot be prepared.
     */
    public function getCsv($domain, $localPath = false)
    {
        // Guard clauses replace the nested if/else; the former `exit;`
        // statement sat behind a `throw` and could never execute.
        if (!$this->_validateDomain($domain)) {
            throw new Exception('The given domain is not connected to your Webmastertools account!');
        }

        if (!$this->_prepareData()) {
            throw new Exception('Error receiving crawl issues for ' . $domain);
        }

        if (!$localPath) {
            $this->_HttpHeaderCSV();
            $this->_outputCSV();
        } else {
            $this->_outputCSV($localPath);
        }
    }

    /**
     * List the titles of all entries in the account's sites feed.
     *
     * @return array|false Entry titles, or false when not logged in, when the
     *                     feed request fails or when the feed has no entries.
     */
    public function getSites()
    {
        if (!$this->_loggedIn) {
            return false;
        }

        $xml = $this->_getData('feeds/sites/');
        if (!$xml) {
            return false;
        }

        $dom = new DOMDocument();
        $dom->loadXML($xml);

        $titles = array();
        foreach ($dom->getElementsByTagName('entry') as $entry) {
            $titles[] = $entry->getElementsByTagName('title')->item(0)->nodeValue;
        }

        if (count($titles) > 0) {
            return $titles;
        }

        return false;
    }

    /**
     * Authenticate against the ClientLogin endpoint and store the auth token.
     *
     * On success the token is kept in $this->_auth and $this->_loggedIn is
     * set to true.
     *
     * @param string $mail Account e-mail address.
     * @param string $pass Account password.
     *
     * @return bool True on success (failures are reported by exception).
     *
     * @throws Exception When the request fails, the response status is not
     *                   200 or the response carries no auth token.
     */
    public function login($mail, $pass)
    {
        // Always send the body as a urlencoded string. Handing cURL an array
        // makes it a multipart upload in which a value starting with '@' is
        // read as a file path; the previous `$pass[0]` check for that case
        // also raised an error on an empty password. The explicit '&'
        // separator keeps the body independent of the arg_separator.output
        // ini setting.
        $postRequest = http_build_query(array(
            'accountType' => 'HOSTED_OR_GOOGLE',
            'Email' => $mail,
            'Passwd' => $pass,
            'service' => "sitemaps",
            'source' => "Google-WMTdownloadscript-0.11-php"
        ), '', '&');

        $ch = curl_init(self::HOST . '/accounts/ClientLogin');
        curl_setopt_array($ch, array(
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_CONNECTTIMEOUT => 30,
            // Credentials travel over this connection, so the server
            // certificate must be verified.
            CURLOPT_SSL_VERIFYPEER => 1,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_POST => 1,
            CURLOPT_POSTFIELDS => $postRequest,
        ));

        $output = curl_exec($ch);
        $info   = curl_getinfo($ch);
        curl_close($ch);

        // curl_exec() returns false on transport errors (including a failed
        // certificate check); treat that like any other failed login.
        if (false === $output || 200 != $info['http_code']) {
            throw new Exception('Login failed!');
        }

        // Require a non-empty token; \S+ also keeps a trailing "\r" out of
        // the captured value.
        if (1 !== preg_match('/Auth=(\S+)/', $output, $match)) {
            throw new Exception('Login failed!');
        }

        $this->_auth     = $match[1];
        $this->_loggedIn = true;

        return true;
    }

    /**
     * Download every crawl issue of the current domain into $this->_data.
     *
     * A first request with max-results=1 reads the total result count; the
     * issues are then fetched in pages of 100 entries. $this->_data is
     * replaced by a headline row followed by one row per issue.
     *
     * @return bool True when all pages were read. False when not logged in,
     *              when the total count is missing or zero, or when any feed
     *              request or XML parse fails (rows collected up to that
     *              point stay in $this->_data).
     */
    private function _prepareData()
    {
        if (!$this->_loggedIn) {
            return false;
        }

        $maxResults = 100;
        $encUri     = urlencode($this->_domain);
        $feedPath   = "feeds/{$encUri}/crawlissues";
        $wtNs       = 'http://schemas.google.com/webmasters/tools/2007';

        /*
         * Get the total result count / result page count
         */
        $feed = $this->_getData("{$feedPath}?start-index=1&max-results=1");
        if (!$feed) {
            return false;
        }

        $doc = new DOMDocument();
        if (!$doc->loadXML($feed)) {
            return false;
        }

        $totalResults = (int) $this->_childValue($doc, 'totalResults', 'http://a9.com/-/spec/opensearch/1.1/');
        unset($feed, $doc);

        if ($totalResults <= 0) {
            return false;
        }

        $resultPages = (int) ceil($totalResults / $maxResults);

        // Csv data headline
        $this->_data = array(
            array(
                'Issue Id',
                'Crawl type',
                'Issue type',
                'Detail',
                'URL',
                'Date detected',
                'Last detected'
            )
        );

        // Prefix stripped from each entry id; it is the same for every entry,
        // so build it once.
        $idPrefix = self::HOST . self::SERVICEURI . "{$feedPath}/";

        /*
         * Paginate over issue feeds
         */
        for ($page = 1; $page <= $resultPages; $page++) {
            $startIndex = ($maxResults * ($page - 1)) + 1;

            $feed = $this->_getData("{$feedPath}?start-index={$startIndex}&max-results={$maxResults}");
            if (!$feed) {
                // A failed page request must not be handed to loadXML().
                return false;
            }

            $doc = new DOMDocument();
            if (!$doc->loadXML($feed)) {
                return false;
            }

            foreach ($doc->getElementsByTagName('entry') as $node) {
                // add issue data to results array
                $this->_data[] = array(
                    str_replace($idPrefix, '', $this->_childValue($node, 'id')),
                    $this->_childValue($node, 'crawl-type', $wtNs),
                    $this->_childValue($node, 'issue-type', $wtNs),
                    $this->_childValue($node, 'detail', $wtNs),
                    $this->_childValue($node, 'url', $wtNs),
                    $this->_formatIssueDate($this->_childValue($node, 'date-detected', $wtNs)),
                    $this->_formatIssueDate($this->_childValue($node, 'updated'))
                );
            }

            unset($feed, $doc);
        }

        return true;
    }

    /**
     * Return the text of the first descendant element with the given name,
     * or an empty string when there is no such element.
     *
     * @param DOMDocument|DOMElement $node      Node to search below.
     * @param string                 $localName Element name to look for.
     * @param string|null            $namespace Namespace URI, or null for a
     *                                          plain tag-name lookup.
     *
     * @return string
     */
    private function _childValue($node, $localName, $namespace = null)
    {
        $matches = (null === $namespace)
            ? $node->getElementsByTagName($localName)
            : $node->getElementsByTagNameNS($namespace, $localName);

        $first = $matches->item(0);

        return (null === $first) ? '' : $first->nodeValue;
    }

    /**
     * Format a feed timestamp as d/m/Y; unparseable or empty input yields an
     * empty string instead of the epoch date.
     *
     * @param string $raw Date/time text taken from the feed.
     *
     * @return string
     */
    private function _formatIssueDate($raw)
    {
        $timestamp = strtotime((string) $raw);

        return (false === $timestamp) ? '' : date('d/m/Y', $timestamp);
    }

    /**
     * Perform an authenticated GET request below HOST . SERVICEURI.
     *
     * @param string $url Path (and query string) relative to the service URI.
     *
     * @return string|false Response body, or false when not logged in, when
     *                      the transfer fails or when the status is not 200.
     */
    private function _getData($url)
    {
        if (!$this->_loggedIn) {
            return false;
        }

        $header = array(
            'Authorization: GoogleLogin auth=' . $this->_auth,
            'GData-Version: 2'
        );

        $ch = curl_init(self::HOST . self::SERVICEURI . $url);
        curl_setopt_array($ch, array(
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_CONNECTTIMEOUT => 30,
            // The auth token is sent with every request, so the server
            // certificate must be verified.
            CURLOPT_SSL_VERIFYPEER => 1,
            CURLOPT_SSL_VERIFYHOST => 2,
            CURLOPT_FOLLOWLOCATION => 1,
            // CURLOPT_ENCODING takes the Accept-Encoding header value. The
            // former value 1 was sent literally as "Accept-Encoding: 1", so
            // responses were never compressed. An empty string advertises
            // every encoding cURL supports and enables transparent decoding.
            CURLOPT_ENCODING => '',
            CURLOPT_HTTPHEADER => $header
        ));

        $result = curl_exec($ch);
        $info   = curl_getinfo($ch);
        curl_close($ch);

        if (false === $result || 200 != $info['http_code']) {
            return false;
        }

        return $result;
    }

    /**
     * Send the HTTP headers that make the client download the response as an
     * uncached CSV attachment.
     *
     * @return void
     */
    private function _HttpHeaderCSV()
    {
        header('Content-type: text/csv; charset=utf-8');
        // _getFilename() already starts with "gwt-crawlerrors-"; prepending it
        // again produced "gwt-crawlerrors-gwt-crawlerrors-...".
        header('Content-disposition: attachment; filename=' . $this->_getFilename());
        header('Pragma: no-cache');
        header('Expires: 0');
    }

    /**
     * Write every row of $this->_data as one CSV line.
     *
     * @param string|false $localPath Directory to create the CSV file in, or
     *                                false to write to php://output.
     *
     * @return void
     *
     * @throws Exception When the output target cannot be opened for writing.
     */
    private function _outputCSV($localPath = false)
    {
        $target = !$localPath ? 'php://output' : $localPath . DIRECTORY_SEPARATOR . $this->_getFilename();

        $handle = fopen($target, "w");
        if (false === $handle) {
            // Without this check every fputcsv() call below would run against
            // an invalid handle.
            throw new Exception('Unable to open ' . $target . ' for writing.');
        }

        // A plain loop replaces array_walk() with a callback that had to be
        // declared as a global function from inside this method.
        foreach ($this->_data as $row) {
            fputcsv($handle, $row); // add parameters if you want
        }

        fclose($handle);
    }

    /**
     * Build the CSV file name from the current domain's host and the current
     * date and time.
     *
     * @return string
     */
    private function _getFilename()
    {
        $host  = parse_url($this->_domain, PHP_URL_HOST);
        $stamp = date('Ymd-His');

        return 'gwt-crawlerrors-' . $host . '-' . $stamp . '.csv';
    }

    /**
     * Check that $domain is a valid URL whose host equals the host of one of
     * the sites returned by getSites(); on a match remember it in
     * $this->_domain.
     *
     * @param string $domain Site URL to check.
     *
     * @return bool True when a site with the same host was found.
     */
    private function _validateDomain($domain)
    {
        if (!filter_var($domain, FILTER_VALIDATE_URL)) {
            return false;
        }

        $knownSites = $this->getSites();
        if (!$knownSites) {
            return false;
        }

        // The requested host does not change between iterations.
        $wantedHost = parse_url($domain, PHP_URL_HOST);

        foreach ($knownSites as $siteUrl) {
            $siteHost = parse_url($siteUrl, PHP_URL_HOST);
            if ($wantedHost != $siteHost) {
                continue;
            }

            $this->_domain = $domain;
            return true;
        }

        return false;
    }

1 个答案:

答案 0 :(得分:0)

来自Google的网址错误Google已经仔细添加了错误的扩展程序,您会看到它 谷歌废话!
您的网站可能很好。