I am using the following code to check a given list of URLs for broken links, but the process is very slow. I need to speed it up significantly.
$url_list = array(
"http://goog528le.com",
"http://facebook.com",
"http://google.com",
"http://youtube.com",
"http://yahoo.com",
"http://amazon.com",
"http://baidu.com",
"http://wikipedia.org",
"http://live.com",
"http://qq.com",
"http://taobao.com",
"http://google.co.in",
"http://twitter.com",
"http://blogspot.com",
"http://yahoo.co.jp",
"http://linkedin.com",
"http://bing.com",
"http://sina.com.cn"
, "http://yandex.ru");
// 1. multi handle
$mh = curl_multi_init();
$max_connections = 10;
$dead_urls = array();
$not_found_urls = array();
$working_urls = array();
// 2. add multiple URLs to the multi handle
for ($i = 0; $i < $max_connections; $i++) {
add_url_to_multi_handle($mh, $url_list);
}
// 3. initial execution
do {
$mrc = curl_multi_exec($mh, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
// 4. main loop
while ($active && $mrc == CURLM_OK) {
// 5. there is activity
if (curl_multi_select($mh) != -1) {
// 6. do work
do {
$mrc = curl_multi_exec($mh, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
// 7. is there info?
if ($mhinfo = curl_multi_info_read($mh)) {
// this means one of the requests has finished
// 8. get the info on the curl handle
$chinfo = curl_getinfo($mhinfo['handle']);
// 9. dead link?
if (!$chinfo['http_code']) {
$dead_urls[] = $chinfo['url'];
// 10. 404?
} else if ($chinfo['http_code'] == 404) {
$not_found_urls[] = $chinfo['url'];
// 11. working
} else {
$working_urls[] = $chinfo['url'];
}
// 12. remove the handle
curl_multi_remove_handle($mh, $mhinfo['handle']);
curl_close($mhinfo['handle']);
// 13. add a new url and do work
if (add_url_to_multi_handle($mh, $url_list)) {
do {
$mrc = curl_multi_exec($mh, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
}
}
}
}
// 14. finished
curl_multi_close($mh);
echo "==Dead URLs==\n";
echo implode("\n", $dead_urls) . "\n\n";
echo "==404 URLs==\n";
echo implode("\n", $not_found_urls) . "\n\n";
echo "==Working URLs==\n";
echo implode("\n", $working_urls);
// 15. adds a url to the multi handle
function add_url_to_multi_handle($mh, $url_list)
{
static $index = 0;
// if we have another url to get
if (isset($url_list[$index]) && $url_list[$index]) {
// new curl handle
$ch = curl_init();
// set the url
curl_setopt($ch, CURLOPT_URL, $url_list[$index]);
// to prevent the response from being outputted
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
// follow redirections
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
// do not need the body. this saves bandwidth and time
curl_setopt($ch, CURLOPT_NOBODY, 1);
// add it to the multi handle
curl_multi_add_handle($mh, $ch);
// increment so next url is used next time
$index++;
return true;
} else {
// we are done adding new URLs
return false;
}
}
Answer (score: 3)
The solution is to process each request as soon as it completes, which eliminates the CPU cycles wasted on busy-waiting. It is also a good idea to keep a queue of cURL requests to get maximum throughput: each time a request finishes, I add a new one from the queue. By dynamically adding and removing handles, a constant number of downloads stays in flight, which gives us a way to cap the number of simultaneous requests we send. The result is that a large number of cURL requests are processed in parallel in a faster and more efficient way.
Here is a reference function:
function rolling_curl($urls, $callback, $custom_options = null) {
// make sure the rolling window isn't greater than the # of urls
$rolling_window = 5;
$rolling_window = (sizeof($urls) < $rolling_window) ? sizeof($urls) : $rolling_window;
$master = curl_multi_init();
$curl_arr = array();
// add additional curl options here
$std_options = array(CURLOPT_RETURNTRANSFER => true,
CURLOPT_FOLLOWLOCATION => true,
CURLOPT_MAXREDIRS => 5);
$options = ($custom_options) ? ($std_options + $custom_options) : $std_options;
// start the first batch of requests
for ($i = 0; $i < $rolling_window; $i++) {
$ch = curl_init();
$options[CURLOPT_URL] = $urls[$i];
curl_setopt_array($ch, $options);
curl_multi_add_handle($master, $ch);
}
do {
while(($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
if($execrun != CURLM_OK)
break;
// a request was just completed -- find out which one
while ($done = curl_multi_info_read($master)) {
$info = curl_getinfo($done['handle']);
if ($info['http_code'] == 200) {
$output = curl_multi_getcontent($done['handle']);
// request successful. process output using the callback function.
$callback($output);
} else {
// request failed. add error handling here.
}
// start a new request, but only if there are URLs left in the queue.
// queue it before removing the finished handle so the multi handle never runs dry.
if (isset($urls[$i])) {
$ch = curl_init();
$options[CURLOPT_URL] = $urls[$i++]; // increment i
curl_setopt_array($ch, $options);
curl_multi_add_handle($master, $ch);
}
// remove and free the curl handle that just completed (success or failure)
curl_multi_remove_handle($master, $done['handle']);
curl_close($done['handle']);
}
} while ($running);
curl_multi_close($master);
return true;
}
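For completeness, a minimal usage sketch is shown below. The URL list, the callback name handle_response, and the CURLOPT_TIMEOUT option are illustrative assumptions rather than part of the answer above. Note that rolling_curl() only hands the body of 200 responses to the callback, so to reproduce the dead/404/working classification from the question you would extend it to also pass the result of curl_getinfo() (or record status codes in the failure branch).
// Usage sketch: the callback name and URL list below are hypothetical examples.
$urls_to_check = array(
    "http://google.com",
    "http://wikipedia.org",
    "http://goog528le.com"
);
// Hypothetical callback: receives the body of each successful (HTTP 200) response.
function handle_response($output)
{
    echo "Fetched " . strlen($output) . " bytes\n";
}
// Extra cURL options can be passed via the third parameter; rolling_curl() merges
// them with its defaults (CURLOPT_TIMEOUT is an assumed example here).
rolling_curl($urls_to_check, 'handle_response', array(CURLOPT_TIMEOUT => 10));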
Hope this helps!