Question

每次我尝试抓取时，Google 都会封禁我的服务器 IP。比如我们在制作一个批量 Google PR 检查脚本时，Google 会在大约 3k 到 5k 次请求之后封禁我们的 IP。因此，为了给用户提供最好的结果，我们需要通过代理发送 cURL 请求。不仅是针对 Google，我们在其他情况下也需要通过代理发送请求。所以，如果有人知道怎么做，请告诉我具体步骤；否则请不要说这个问题偏离主题。如何通过代理而不是我的服务器 IP 发送 cURL 请求？或者有没有更好的方法来抓取 Google 数据？

使用cURL的合法方式是什么？

Answer 1

使用 CURLOPT_PROXY 选项：curl_setopt($ch, CURLOPT_PROXY, $proxy);

 //scrape.php

/**
 * Minimal cURL-based page scraper with optional proxy and User-Agent support.
 *
 * Typical use: configure with setProxy() / setUserAgent(), then call scrape().
 */
class Scraper {

    /** @var string|null URL passed to the most recent scrape() call. */
    public $target_url;

    /** @var string|null Raw response body of the most recent successful scrape(). */
    public $html;

    /**
     * Declared explicitly (and kept public) so the class no longer relies on
     * dynamic property creation, which is deprecated as of PHP 8.2.
     *
     * @var string|null Proxy handed to CURLOPT_PROXY, e.g. "127.0.0.1:9150".
     */
    public $proxy;

    /** @var string|null Value sent as the User-Agent header. */
    public $agent;

    /**
     * Fetch a URL (through the configured proxy, if any) and run the XPath
     * query from _parseData() against the response.
     *
     * @param string $target_url URL to fetch.
     * @return DOMNodeList|false Nodes matched by the XPath query, or false
     *                           (after echoing 'url/curl error') when the
     *                           transfer fails or the body is empty.
     */
    public function scrape($target_url) {
        $this->target_url = $target_url;

        $ch = curl_init();

        $proxy = $this->_getProxy();
        if ($proxy) {
            curl_setopt($ch, CURLOPT_PROXY, $proxy);
        }

        // Only override the User-Agent when one was configured, instead of
        // handing boolean false to a string option.
        $userAgent = $this->_getUserAgent();
        if ($userAgent) {
            curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
        }

        curl_setopt($ch, CURLOPT_URL, $target_url);
        // Treat HTTP status codes >= 400 as transfer failures.
        curl_setopt($ch, CURLOPT_FAILONERROR, true);
        // Return the body from curl_exec() rather than printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);

        $html = curl_exec($ch);
        curl_close($ch);

        // An empty body is rejected as well: there is nothing to parse.
        if ($html === false || $html === '') {
            echo 'url/curl error';
            return false;
        }

        $this->html = $html;

        // Hand the parsed result back to the caller; previously it was discarded.
        return $this->_parseData();
    }

    /**
     * Set the proxy used for subsequent requests.
     *
     * @param string $proxy Proxy in a form accepted by CURLOPT_PROXY.
     */
    public function setProxy($proxy) {
        $this->proxy = $proxy;
    }

    /**
     * @return string|false Configured proxy, or false when none is set.
     */
    private function _getProxy() {
        return isset($this->proxy) ? $this->proxy : false;
    }

    /**
     * Set the User-Agent header used for subsequent requests.
     *
     * @param string $agent User-Agent string.
     */
    public function setUserAgent($agent) {
        $this->agent = $agent;
    }

    /**
     * @return string|false Configured User-Agent, or false when none is set.
     */
    private function _getUserAgent() {
        return isset($this->agent) ? $this->agent : false;
    }

    /**
     * Parse $this->html and evaluate the XPath query against it.
     *
     * @return DOMNodeList|false Result of DOMXPath::query().
     */
    private function _parseData() {
        // Collect libxml parse warnings internally instead of silencing the
        // call with "@"; real-world HTML is rarely well-formed.
        $previous = libxml_use_internal_errors(true);

        $dom = new DOMDocument();
        $dom->loadHTML($this->html);

        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $xpath = new DOMXPath($dom);

        // your xpath query here
        return $xpath->query("//div[@id='ires']");
    }

}

使用示例

require 'scrape.php';

$scraper = new Scraper();

// Route the request through a proxy listening on localhost.
// NOTE(review): no scheme is given, so cURL treats this as an HTTP proxy;
// if the listener on this port is a SOCKS proxy, use a "socks5://" prefix — confirm.
$scraper->setProxy('127.0.0.1:9150');

// The query must be in the query string: a "#q=..." fragment is never sent
// to the server, so the original URL only requested the Google home page.
$data = $scraper->scrape('https://www.google.com/search?q=stack+overflow');

从代理IP - PHP发送cURL请求

1 个答案: