PHP cURL 爬虫出现奇怪的超时

时间:2013-10-31 09:32:28

标签: php curl web-crawler

我正在尝试运行一个 cURL 脚本来抓取某个网站,但它总是超时。该脚本在本地可以正常工作,但在这台服务器上不行。据我所知,一切都已正确配置。

请注意,在此代码示例中,我已用 XXXXX 替换了网站的完整域名。

    <?php
// Report every notice, warning and error while debugging the crawl.
error_reporting(E_ALL);

// Describe the request as one literal: target URL, HTTP verb and the
// header list produced by getHeaders() for a request without a body.
$target = "http://xxxxx.eu/search?f=b";
$request = array(
    'url'    => $target,
    'method' => 'get',
    'header' => getHeaders(0),
);

// Fetch the page and print whatever getPageCURL() returns.
echo getPageCURL($request);

// Stop here so nothing after the function definitions is emitted.
exit();


/**
 * Builds the list of HTTP request headers sent with each crawl request.
 *
 * Only request headers are returned. Response-only headers
 * ("Content-Encoding", "Vary") and a hard-coded "Content-Length: 22" are
 * deliberately not sent: announcing a 22-byte body on a request that has
 * no body can make the server wait for data that never arrives, which
 * shows up on the client as a timeout.
 *
 * @param int $content_length Size in bytes of the request body. A
 *                            "Content-Length" header is added only when
 *                            this is greater than 10 (threshold kept from
 *                            the original code); the value must match the
 *                            real body size or the request may stall.
 * @return array List of "Name: value" strings suitable for CURLOPT_HTTPHEADER.
 */
function getHeaders($content_length=0)
{
    $header = array(
        "Host: xxxxx.eu",
        "Accept-Language: en-us,en;q=0.5",
        "Accept-Encoding: gzip, deflate",
        "User-Agent: Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Content-Type: text/html; charset=utf-8",
        "Connection: keep-alive",
    );

    // Header names cannot contain spaces, so the former "Content Size"
    // line is emitted under its real name, "Content-Length".
    if ($content_length > 10) {
        $header[] = "Content-Length: " . $content_length;
    }

    return $header;
}


/**
 * Fetches a page with cURL and returns its body with newlines, carriage
 * returns and tabs replaced by single spaces.
 *
 * The request is attempted at most 10 times and the function returns as
 * soon as one attempt succeeds. DNS and connect failures (cURL errors 6
 * and 7) wait 10 seconds before the next attempt; any other failure is
 * retried immediately. Every attempt counts against the limit, so an
 * unreachable host can no longer cause an endless loop.
 *
 * @param array|string $request Either the URL itself, or an array with
 *        the recognised keys:
 *        - url            (string) address to fetch; spaces become "+"
 *        - header         (array)  lines for CURLOPT_HTTPHEADER
 *        - referer        (string) value for CURLOPT_REFERER
 *        - ssl            (mixed)  if set (non-null), peer verification is
 *                                  disabled
 *        - method         (string) 'post' issues a POST, anything else GET
 *        - post_data      (mixed)  body for CURLOPT_POSTFIELDS
 *        - page_header    (mixed)  value for CURLOPT_HEADER, default 0
 *        - followlocation (bool)   follow redirects, default true
 *        - cookies        (bool)   use cookie.txt as cookie jar, default true
 *        Unrecognised keys are ignored.
 * @return string The flattened response body, or '' when the URL is empty
 *                or every attempt failed.
 */
function getPageCURL($request)
{
    // Recognised options and their defaults. Only these keys are read from
    // $request, so a caller-supplied key can never overwrite an internal
    // variable of this function.
    $options = array(
        'url'            => '',
        'header'         => null,
        'referer'        => null,
        'ssl'            => null,
        'method'         => null,
        'post_data'      => null,
        'page_header'    => 0,
        'followlocation' => true,
        'cookies'        => true,
    );
    if (is_array($request)) {
        $options = array_merge($options, array_intersect_key($request, $options));
    } else {
        $options['url'] = $request;
    }

    if (empty($options['url'])) {
        return '';
    }
    $url = str_replace(' ', '+', $options['url']);

    $maxAttempts = 10;
    $retryDelay  = 10; // seconds to wait after a DNS/connect failure
    $page        = false;

    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        $curl = curl_init();

        if (is_array($options['header'])) {
            curl_setopt($curl, CURLOPT_HTTPHEADER, $options['header']);
        }
        if (!empty($options['referer'])) {
            curl_setopt($curl, CURLOPT_REFERER, $options['referer']);
        }
        if (isset($options['ssl'])) {
            // NOTE(review): this turns certificate verification OFF whenever
            // the 'ssl' key is present, whatever its value. Kept for
            // backward compatibility, but it exposes the transfer to
            // man-in-the-middle attacks.
            curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
        }
        if ($options['method'] === 'post') {
            curl_setopt($curl, CURLOPT_POST, true);
            if (isset($options['post_data']) && $options['post_data'] != '') {
                curl_setopt($curl, CURLOPT_POSTFIELDS, $options['post_data']);
            }
        }
        if ($options['cookies']) {
            // Same file for reading and writing so cookies persist across calls.
            curl_setopt($curl, CURLOPT_COOKIEFILE, 'cookie.txt');
            curl_setopt($curl, CURLOPT_COOKIEJAR, 'cookie.txt');
        }

        curl_setopt($curl, CURLOPT_TIMEOUT, 120);
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
        curl_setopt($curl, CURLOPT_HEADER, $options['page_header']);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, $options['followlocation']);

        $page  = curl_exec($curl);
        $errno = curl_errno($curl);
        curl_close($curl);

        // With CURLOPT_RETURNTRANSFER, curl_exec() returns false only on
        // failure; any string (even an empty one) is a completed transfer.
        if ($page !== false) {
            break;
        }

        // Back off only for failures that are usually transient network
        // problems, and never after the final attempt.
        $isNetworkError = in_array(
            $errno,
            array(CURLE_COULDNT_RESOLVE_HOST, CURLE_COULDNT_CONNECT),
            true
        );
        if ($isNetworkError && $attempt < $maxAttempts) {
            sleep($retryDelay);
        }
    }

    if ($page === false) {
        return '';
    }

    return str_replace(array("\n", "\r", "\t"), " ", $page);
}


?>

如果我尝试对该网站执行一次基本的 wget,会得到如下输出:

root@vps38132:~# wget http://xxxxx.eu
--2013-10-31 10:30:02--  http://xxxxx.eu/
Resolving xxxxxx.eu (xxxxxx.eu)... 31.7.58.171, 31.7.58.172, 31.7.58.170
Connecting to xxxxxx.eu (xxxxxx.eu)|31.7.58.171|:80...

有什么想法吗?我可以用 curl/wget 访问其他网站,唯独这个网站不行,这是怎么回事?

1 个答案:

答案 0 :(得分:1)

进行 cURL 调用时需要进行身份验证,因此请在 curl 初始化之后添加下面的代码:

    curl_setopt($curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
    curl_setopt($curl, CURLOPT_USERPWD, 'username:password');

如果需要更多细节,请告诉我。