加快cURL页面登录和抓取速度

时间:2013-05-02 18:04:22

标签: php curl

我有一个函数,用于登录某个网站,然后在下一个页面中搜索字符串。整个过程目前需要10秒钟,我想看看有什么办法可以加快速度。我想知道是否可以让cURL的登录状态在客户端会话中保持下来(避免每次都重新登录),或者是否有更高效的方式来搜索返回的文档。

/**
 * Sends the login form to $url and hands back the cURL handle for reuse.
 *
 * The response of the login request is discarded; only the cookies that
 * cURL records in $cookieJar and the handle itself are of interest.
 *
 * @param string       $url         Address the login form is posted to.
 * @param string|array $post_values Form fields passed to CURLOPT_POSTFIELDS.
 * @param string       $cookieJar   Path of the file cookies are read from and written to.
 *
 * @return mixed The cURL handle created by curl_init(), after the request has run.
 */
public function curlLogin($url, $post_values, $cookieJar) {
    $handle = curl_init();

    // Same options, in the same order, as individual curl_setopt() calls would apply them.
    curl_setopt_array($handle, [
        CURLOPT_URL            => $url,
        CURLOPT_TIMEOUT        => 30,
        CURLOPT_USERAGENT      => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        // The same file serves as cookie store (JAR) and cookie source (FILE).
        CURLOPT_COOKIEJAR      => $cookieJar,
        CURLOPT_COOKIEFILE     => $cookieJar,
        CURLOPT_COOKIESESSION  => 0,
        // Response headers are included in the output.
        CURLOPT_HEADER         => 1,
        // Make curl_exec() return the response instead of printing it.
        CURLOPT_RETURNTRANSFER => 1,
        // Peer certificate verification is switched off.
        CURLOPT_SSL_VERIFYPEER => 0,
        CURLOPT_POST           => 1,
        CURLOPT_POSTFIELDS     => $post_values,
        CURLOPT_HTTPHEADER     => ["Content-type: application/x-www-form-urlencoded"],
    ]);

    // The login response body is not needed, so the return value is ignored.
    curl_exec($handle);

    return $handle;
}

    /**
     * Posts $post_values to $url on an existing cURL handle and returns the response.
     *
     * Every option is (re)applied to the handle before the request is executed,
     * so the handle may come straight from a previous request.
     *
     * @param mixed        $curl_connection cURL handle to run the request on.
     * @param string       $url             Address the form is posted to.
     * @param string|array $post_values     Form fields passed to CURLOPT_POSTFIELDS.
     * @param string       $cookieJar       Path of the file cookies are read from and written to.
     *
     * @return mixed Whatever curl_exec() returns: the response (headers included)
     *               as a string, or false when the request fails.
     */
    public function curlPost($curl_connection, $url, $post_values, $cookieJar) {
        // Same options, in the same order, as individual curl_setopt() calls would apply them.
        curl_setopt_array($curl_connection, [
            CURLOPT_URL            => $url,
            CURLOPT_TIMEOUT        => 30,
            CURLOPT_USERAGENT      => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
            // The same file serves as cookie store (JAR) and cookie source (FILE).
            CURLOPT_COOKIEJAR      => $cookieJar,
            CURLOPT_COOKIEFILE     => $cookieJar,
            CURLOPT_COOKIESESSION  => 0,
            // Response headers are included in the returned string.
            CURLOPT_HEADER         => 1,
            // Make curl_exec() return the response instead of printing it.
            CURLOPT_RETURNTRANSFER => 1,
            // Peer certificate verification is switched off.
            CURLOPT_SSL_VERIFYPEER => 0,
            CURLOPT_POST           => 1,
            CURLOPT_POSTFIELDS     => $post_values,
            CURLOPT_HTTPHEADER     => ["Content-type: application/x-www-form-urlencoded"],
        ]);

        return curl_exec($curl_connection);
    }

// Temporary file in which cURL stores, and from which it reads back, the session cookies.
$cookieJar = tempnam("/tmp", "CURLCOOKIE");

// Log in first; the returned handle is reused for the follow-up request.
$curl_connection = $this->curlLogin($login_url, $post_values, $cookieJar);

// Request the page whose content is searched for the marker strings.
$result = $this->curlPost($curl_connection, $next_url, $params, $cookieJar);

// strpos() returns the integer 0 when the needle sits at the very start of the
// haystack and false only when it is absent, so "found" must be tested with
// !== false; a "> 0" comparison would misreport a match at offset 0 as missing.
if (strpos($result, 'string 1') !== false) {
    $success = true;
    $message = 'string 1 is present';
} elseif (strpos($result, 'string 2') !== false) {
    $success = false;
    $message = 'string 2 is present';
} elseif (strpos($result, 'string 3') !== false) {
    $success = false;
    $message = 'string 3 is present';
} else {
    $success = false;
    $message = 'None of the above strings are present.';
}

// Release the handle and remove the temporary cookie file.
curl_close($curl_connection);
unlink($cookieJar);

1 个答案:

答案 0 :(得分:2)

您可以通过重复使用cookiejar来避免每次都登录。

在包含脚本的目录中创建一个名为cookies.txt的文件,并指定: $cookieJar = 'cookies.txt'

首次运行脚本之后,只需删除对curlLogin()函数的调用;curlPost()函数会使用已保存的cookie,并像已经登录一样返回数据。

请记住,CURLOPT_COOKIEFILE用于指定从哪里“读取”Cookie,CURLOPT_COOKIEJAR是您希望写入响应Cookie的位置。

所以,你在curlPost()函数中可能并不需要设置CURLOPT_COOKIEJAR。