我的项目需要使用curl multi execute命中存储在数组中的每个用户名的url。用户名数组的大小几乎是45k,到现在为止我创建了另一个45k url数组,我想要点击,然后有效地发送请求我已经将url数组分成每个大小为200的块。然后我将每个chunked数组传递给multi_curl_execute以获得响应,但问题是接收所有45k请求的响应需要太多时间。我已经打印了响应数组并且它按预期不断增加但是打印所有的响应它花费了太多时间。请帮助我,因为我必须在明天之前实现我的目标。我将在下面给出我的代码
$array1=[1,2,3,4,5,6.....45000];
现在使用每个用户名创建url作为查询字符串
// Build one URL per username (query-string parameter "u").
$url = array();
foreach ($array1 as $arr) {
    $url[] = 'abc.com?u=' . $arr;
}
// Create the batches.
// array_chunk() already returns an array of chunks, so assign it directly.
// The original `$chunk[] = array_chunk(...)` wrapped that result in ANOTHER
// array, which is why a double loop was needed to reach each batch.
$chunk = array_chunk($url, 200, true);
// Send each batch of 200 URLs and collect the responses.
// $res must be initialized: array_push() on an undefined variable is a
// TypeError on PHP 8.
$res = array();
foreach ($chunk as $c_arr) {
    $res[] = multiRequest($c_arr);
}
// My multi_curl function
/**
 * Fetch a batch of URLs concurrently with curl_multi and return their bodies.
 *
 * @param array $data    map of id => URL; result keys mirror these ids
 * @param array $options optional extra CURLOPT_* settings applied to every
 *                       handle (previously accepted but silently ignored)
 * @return array map of id => response body (string, or false on failure)
 */
function multiRequest($data, $options = array())
{
    $curly = array();
    $result = array();
    // Nothing to do for an empty batch.
    if (empty($data)) {
        return $result;
    }
    $mh = curl_multi_init();
    $ua = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.A.B.C Safari/525.13';
    foreach ($data as $id => $d)
    {
        $curly[$id] = curl_init();
        curl_setopt_array($curly[$id], array(
            CURLOPT_URL            => $d,
            CURLOPT_RETURNTRANSFER => true, // (was set twice in the original)
            CURLOPT_USERAGENT      => $ua,
            CURLOPT_AUTOREFERER    => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS      => 20,
            CURLOPT_HTTPGET        => true,
            CURLOPT_HEADER         => 0,
            // Without timeouts one dead host can stall the whole batch —
            // the most likely cause of the 25-30 minute total runtime.
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT        => 30,
        ));
        // Apply caller-supplied overrides, if any.
        if (!empty($options)) {
            curl_setopt_array($curly[$id], $options);
        }
        curl_multi_add_handle($mh, $curly[$id]);
    }
    $running = null;
    do {
        $status = curl_multi_exec($mh, $running);
        if ($running) {
            // Block until there is socket activity instead of busy-spinning;
            // the original loop pegged a CPU core at 100%.
            curl_multi_select($mh, 1.0);
        }
    } while ($running > 0 && $status == CURLM_OK);
    foreach ($curly as $id => $c)
    {
        $result[$id] = curl_multi_getcontent($c);
        curl_multi_remove_handle($mh, $c);
        curl_close($c); // the original leaked every easy handle
    }
    curl_multi_close($mh);
    return $result;
}
请告诉我应该怎么做,因为它需要将近25-30分钟来传递所有45000个请求的响应。现在我在本地计算机上运行此脚本,而稍后它将被安排为cron作业在实时服务器上
答案 0（得分：0）
你尝试过用多进程代替 curl_multi 吗？也许那样会更快？这也不是头一回了。
尝试
<?php
// Worker source: each child process fetches ONE url, taking the username
// as $argv[1]. This string is written to a temp file and run via the PHP CLI.
$code = <<<'CODE'
<?php
$ch=curl_init();
curl_setopt_array($ch,array(
CURLOPT_URL=>'abc.com?u='.urlencode($argv[1]),
CURLOPT_ENCODING=>"",
CURLOPT_USERAGENT=>'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.A.B.C Safari/525.13',
CURLOPT_AUTOREFERER=>true,
CURLOPT_FOLLOWLOCATION=>true,
CURLOPT_MAXREDIRS=>20
));
curl_exec($ch);
curl_close($ch);
CODE;
// tmpfile() deletes the file when the handle is closed/GC'd, so $jobFileh
// must stay alive for the entire run; the path is taken from stream metadata.
$jobFileh = tmpfile ();
$jobFile = stream_get_meta_data ( $jobFileh ) ['uri'];
file_put_contents ( $jobFile, $code );
// Build one CLI command line per username (1..45000).
$jobs = array ();
for($i = 1; $i <= 45000; ++ $i) {
$jobs [] = '/usr/bin/php ' . escapeshellarg ( $jobFile ) . ' ' . escapeshellarg ( ( string ) $i );
}
$starttime = microtime ( true );
// Run the jobs with at most 200 children alive at once; returns per-job
// exit codes plus captured stdout/stderr. See hhb_exec_multi1() below.
$ret = hhb_exec_multi1 ( $jobs, 200 );
$seconds_used = microtime ( true ) - $starttime;
var_dump ( $ret, $seconds_used );
die ();
/**
 * Result record for one finished job run by hhb_exec_multi1():
 * the command line, its exit code, and the captured stdout/stderr text.
 */
class hhb_exec_multi1_ret {
    public $cmd;
    public $ret;
    public $stdout;
    public $stderr;
    /**
     * @param array $attributes map of property name => value; every entry
     *                          is copied onto the matching public property.
     */
    function __construct(array $attributes) {
        foreach (array_keys($attributes) as $prop) {
            $this->{$prop} = $attributes[$prop];
        }
    }
}
/**
 * Run a list of shell commands with a bounded level of concurrency,
 * capturing each command's exit code, stdout and stderr.
 *
 * @param string[] $cmds commands to execute; array keys are preserved in the result
 * @param int $max_concurrent maximum number of child processes alive at once
 * @param callable|null $finished_callback invoked with each hhb_exec_multi1_ret as it completes
 * @throws InvalidArgumentException
 * @return hhb_exec_multi1_ret[] results keyed like $cmds
 */
function hhb_exec_multi1(array $cmds, int $max_concurrent = 10, $finished_callback = NULL): array {
	// TODO: more error checking, if proc_open fail, out of ram, tmpfile() fail, etc
	// input validation
	if ($max_concurrent < 1) {
		throw new InvalidArgumentException ( '$max_concurrent must be above 0... and less or equal to' . PHP_INT_MAX );
	}
	foreach ( $cmds as $tmp ) {
		if (! is_string ( $tmp )) {
			throw new InvalidArgumentException ( '$cmds must be an array of strings!' );
		}
	}
	$ret = array ();
	$running = array ();
	foreach ( $cmds as $key => $cmd ) {
		$current = array (
				'cmd' => $cmd,
				'ret' => - 1,
				// Children write stdout/stderr into temp files; the handles
				// are kept so tmpfile() does not delete them prematurely.
				'stdout' => tmpfile (),
				'stderr' => tmpfile (),
				'key' => $key
		);
		$pipes = [ ];
		$descriptorspec = array (
				0 => array ( "pipe", "rb" ),
				1 => array ( "file", stream_get_meta_data ( $current ['stdout'] ) ['uri'], "wb" ),
				2 => array ( "file", stream_get_meta_data ( $current ['stderr'] ) ['uri'], "wb" ) // stderr is a file to write to
		);
		// Throttle: wait until a slot frees up before spawning another child.
		while ( count ( $running ) >= $max_concurrent ) {
			usleep ( 100 * 1000 ); // 100 ms poll interval
			_hhb_exec_multi1_reap ( $running, $ret, $finished_callback );
		}
		$current ['proc'] = proc_open ( $cmd, $descriptorspec, $pipes );
		// Close the child's stdin; otherwise it inherits ours, which is the
		// default behaviour when descriptor [0] is not defined.
		fclose ( $pipes [0] );
		$running [] = $current;
	}
	// Drain: wait for the remaining children to finish.
	while ( count ( $running ) > 0 ) {
		usleep ( 100 * 1000 );
		_hhb_exec_multi1_reap ( $running, $ret, $finished_callback );
	}
	return $ret;
}

/**
 * Move every finished child out of $running into $ret.
 * (Extracted helper: the original duplicated this loop verbatim inside both
 * the throttle loop and the final drain loop.)
 *
 * @param array $running in/out: list of in-flight job records
 * @param array $ret in/out: completed results keyed by original cmd key
 * @param callable|null $finished_callback invoked with each finished record
 */
function _hhb_exec_multi1_reap(array &$running, array &$ret, $finished_callback) {
	foreach ( $running as $runningkey => $check ) {
		$stat = proc_get_status ( $check ['proc'] );
		if ($stat ['running']) {
			continue;
		}
		proc_close ( $check ['proc'] );
		$check ['ret'] = $stat ['exitcode'];
		// Slurp the temp files, then close them (tmpfile() unlinks on close).
		$stdout = file_get_contents ( stream_get_meta_data ( $check ['stdout'] ) ['uri'] );
		fclose ( $check ['stdout'] );
		$check ['stdout'] = $stdout;
		$stderr = file_get_contents ( stream_get_meta_data ( $check ['stderr'] ) ['uri'] );
		fclose ( $check ['stderr'] );
		$check ['stderr'] = $stderr;
		$checkkey = $check ['key'];
		unset ( $check ['key'], $check ['proc'] );
		$tmp = ($ret [$checkkey] = new hhb_exec_multi1_ret ( $check ));
		unset ( $running [$runningkey] );
		if (! empty ( $finished_callback )) {
			$finished_callback ( $tmp );
		}
	}
}
当我在笔记本电脑上对本地 nginx 服务器运行此代码（循环次数设为 45000）时，它在 6 分 39 秒（399 秒）内执行完毕。
编辑:wups,忘了将代码写入作业文件(file_put_contents),修复。