I have a content database of free text, about 11,000 rows of data with 87 columns each, so there are (potentially) about 957,000 fields to check for valid URLs.

I made a regular expression to extract everything that looks like a URL (http/s, etc.) and built up an array called $urls. Then I loop through it, passing each $url to my curl_exec() call.
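Roughly like this (a simplified sketch; the pattern below is a stand-in for my actual regex, and $rows stands for the fetched database rows):

// Simplified sketch of the extraction pass: scan every field of every row
// and collect anything that looks like an http/https URL.
$urls = [];
foreach ($rows as $row) {          // the ~11,000 database rows
    foreach ($row as $field) {     // each of the ~87 columns
        if (preg_match_all('~\bhttps?://[^\s"\'<>]+~i', (string)$field, $m)) {
            foreach ($m[0] as $url) {
                $urls[] = $url;
            }
        }
    }
}
$urls = array_unique($urls); // no point checking the same URL twice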
Here's what I tried with cURL (per $url):
$ch = curl_init();
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, 250);
curl_setopt($ch, CURLOPT_NOBODY, 1);
curl_setopt($ch, CURLOPT_FAILONERROR, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECT_ONLY, 1);
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_HTTPGET, 1);
foreach ($urls as $url) {
    curl_setopt($ch, CURLOPT_URL, $url);
    $exec = curl_exec($ch);
    // Extra stuff here... it does add overhead, but not that much.
}
curl_close($ch);
As far as I can tell, this should run about as fast as possible, yet every URL takes roughly 2-3 seconds.

There must be a faster way?

I plan to run this from a cron job and check my local database first (skip the URL if it has been checked within the last 30 days), only hitting the network otherwise, so the volume will shrink over time. I'd just like to know whether cURL is the best solution, and whether I'm missing something that would make it faster?
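For the record, the 30-day filter would look something like this (url_checks and its columns are a hypothetical tracking table, and $pdo is assumed to be an open PDO connection):

// Sketch of the "skip if checked within the last 30 days" filter, assuming
// a hypothetical tracking table url_checks(url VARCHAR, checked_at DATETIME).
$stmt = $pdo->prepare(
    'SELECT 1 FROM url_checks WHERE url = :url AND checked_at > NOW() - INTERVAL 30 DAY'
);
$toCheck = [];
foreach ($urls as $url) {
    $stmt->execute([':url' => $url]);
    if ($stmt->fetchColumn() === false) {
        $toCheck[] = $url; // not checked recently, so it needs a fresh pass
    }
}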
Edit: based on the comment below by Nick Zulu, I'm now sitting on this code:
function ODB_check_url_array($urls, $debug = true) {
    if (!empty($urls)) {
        $return = [];
        $ch = [];
        $mh = curl_multi_init();
        foreach ($urls as $index => $url) {
            $ch[$index] = curl_init($url);
            curl_setopt($ch[$index], CURLOPT_CONNECTTIMEOUT_MS, 10000);
            curl_setopt($ch[$index], CURLOPT_NOBODY, 1);
            curl_setopt($ch[$index], CURLOPT_FAILONERROR, 1);
            curl_setopt($ch[$index], CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch[$index], CURLOPT_CONNECT_ONLY, 1);
            curl_setopt($ch[$index], CURLOPT_HEADER, 1);
            curl_setopt($ch[$index], CURLOPT_HTTPGET, 1);
            curl_multi_add_handle($mh, $ch[$index]);
        }
        $running = null;
        do {
            curl_multi_exec($mh, $running);
            curl_multi_select($mh, 1); // sleep until there is activity instead of busy-looping
        } while ($running);
        foreach ($ch as $index => $handle) {
            // key the results by URL (a curl handle is not a valid array key)
            $return[$urls[$index]] = curl_multi_getcontent($handle);
            curl_multi_remove_handle($mh, $handle);
            curl_close($handle);
        }
        curl_multi_close($mh);
        return $return;
    }
}
Answer 0 (score: 2)
Let's see:

- use the curl_multi API (it's the only sane choice for doing this in PHP)
- enforce a maximum simultaneous-connection limit; don't just create a connection for every URL (you'll get out-of-memory or out-of-resource errors if you create a million simultaneous connections at once, and I wouldn't even trust the timeout errors in that case)
- fetch only the headers, because downloading the body would be a waste of time and bandwidth

Here is my attempt:
// if return_fault_reason is false, then the return is a simple array of strings of urls that validated.
// otherwise it's an array with the url as the key containing array(bool validated,int curl_error_code,string reason) for every url
function validate_urls(array $urls, int $max_connections, int $timeout_ms = 10000, bool $consider_http_300_redirect_as_error = true, bool $return_fault_reason = true): array
{
if ($max_connections < 1) {
throw new InvalidArgumentException("max_connections MUST be >=1");
}
foreach ($urls as $key => $foo) {
if (!is_string($foo)) {
throw new \InvalidArgumentException("all urls must be strings!");
}
if (empty($foo)) {
unset($urls[$key]); //?
}
}
unset($foo);
$urls = array_unique($urls); // remove duplicates.
$ret = array();
$mh = curl_multi_init();
$workers = array();
$work = function () use (&$ret, &$workers, &$mh, &$return_fault_reason, &$consider_http_300_redirect_as_error) {
// > If an added handle fails very quickly, it may never be counted as a running_handle
while (1) {
curl_multi_exec($mh, $still_running);
if ($still_running < count($workers)) {
break;
}
$cms = curl_multi_select($mh, 10);
//var_dump('sr: ' . $still_running . " c: " . count($workers)." cms: ".$cms);
}
while (false !== ($info = curl_multi_info_read($mh))) {
//echo "NOT FALSE!";
//var_dump($info);
if ($info['msg'] !== CURLMSG_DONE) {
continue;
}
if ($info['result'] !== CURLE_OK) {
if ($return_fault_reason) {
$ret[$workers[(int)$info['handle']]] = array(false, $info['result'], "curl_exec error " . $info['result'] . ": " . curl_strerror($info['result']));
}
} elseif (CURLE_OK !== ($err = curl_errno($info['handle']))) {
if ($return_fault_reason) {
$ret[$workers[(int)$info['handle']]] = array(false, $err, "curl error " . $err . ": " . curl_strerror($err));
}
} else {
$code = (string)curl_getinfo($info['handle'], CURLINFO_HTTP_CODE);
if ($code[0] === "3") {
if ($consider_http_300_redirect_as_error) {
if ($return_fault_reason) {
$ret[$workers[(int)$info['handle']]] = array(false, -1, "got a http " . $code . " redirect, which is considered an error");
}
} else {
if ($return_fault_reason) {
$ret[$workers[(int)$info['handle']]] = array(true, 0, "got a http " . $code . " redirect, which is considered a success");
} else {
$ret[] = $workers[(int)$info['handle']];
}
}
} elseif ($code[0] === "2") {
if ($return_fault_reason) {
$ret[$workers[(int)$info['handle']]] = array(true, 0, "got a http " . $code . " code, which is considered a success");
} else {
$ret[] = $workers[(int)$info['handle']];
}
} else {
// all non-2xx and non-3xx are always considered errors (500 internal server error, 400 client error, 404 not found, etcetc)
if ($return_fault_reason) {
$ret[$workers[(int)$info['handle']]] = array(false, -1, "got a http " . $code . " code, which is considered an error");
}
}
}
curl_multi_remove_handle($mh, $info['handle']);
assert(isset($workers[(int)$info['handle']]));
unset($workers[(int)$info['handle']]);
curl_close($info['handle']);
}
//echo "NO MORE INFO!";
};
foreach ($urls as $url) {
while (count($workers) >= $max_connections) {
//echo "TOO MANY WORKERS!\n";
$work();
}
$neww = curl_init($url);
if (!$neww) {
trigger_error("curl_init() failed! probably means that max_connections is too high and you ran out of resources", E_USER_WARNING);
if ($return_fault_reason) {
$ret[$url] = array(false, -1, "curl_init() failed");
}
continue;
}
$workers[(int)$neww] = $url;
curl_setopt_array($neww, array(
CURLOPT_NOBODY => 1,
CURLOPT_SSL_VERIFYHOST => 0,
CURLOPT_SSL_VERIFYPEER => 0,
CURLOPT_TIMEOUT_MS => $timeout_ms
));
curl_multi_add_handle($mh, $neww);
//curl_multi_exec($mh, $unused_here); LIKELY TO BE MUCH SLOWER IF DONE IN THIS LOOP: TOO MANY SYSCALLS
}
while (count($workers) > 0) {
//echo "WAITING FOR WORKERS TO BECOME 0!";
//var_dump(count($workers));
$work();
}
curl_multi_close($mh);
return $ret;
}
Here is some test code:
$urls = [
'www.example.org',
'www.google.com',
'https://www.google.com',
];
var_dump(validate_urls($urls, 1000, 1, true, false));
returns:
array(0) {
}
because they all timed out (with the 1 millisecond timeout) and fault-reason reporting was disabled (that's the last argument), while
$urls = [
'www.example.org',
'www.google.com',
'https://www.google.com',
];
var_dump(validate_urls($urls, 1000, 1, true, true));
returns:
array(3) {
["www.example.org"]=>
array(3) {
[0]=>
bool(false)
[1]=>
int(28)
[2]=>
string(39) "curl_exec error 28: Timeout was reached"
}
["www.google.com"]=>
array(3) {
[0]=>
bool(false)
[1]=>
int(28)
[2]=>
string(39) "curl_exec error 28: Timeout was reached"
}
["https://www.google.com"]=>
array(3) {
[0]=>
bool(false)
[1]=>
int(28)
[2]=>
string(39) "curl_exec error 28: Timeout was reached"
}
}
Increasing the timeout limit to 1000, we get
var_dump(validate_urls($urls, 1000, 1000, true, false));
=
array(3) {
[0]=>
string(14) "www.google.com"
[1]=>
string(22) "https://www.google.com"
[2]=>
string(15) "www.example.org"
}
and
var_dump(validate_urls($urls, 1000, 1000, true, true));
=
array(3) {
["www.google.com"]=>
array(3) {
[0]=>
bool(true)
[1]=>
int(0)
[2]=>
string(50) "got a http 200 code, which is considered a success"
}
["www.example.org"]=>
array(3) {
[0]=>
bool(true)
[1]=>
int(0)
[2]=>
string(50) "got a http 200 code, which is considered a success"
}
["https://www.google.com"]=>
array(3) {
[0]=>
bool(true)
[1]=>
int(0)
[2]=>
string(50) "got a http 200 code, which is considered a success"
}
}
And so on :) The speed should depend on your bandwidth and on the $max_connections variable, which is configurable.
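To wire this into your cron job you could feed the function the URLs in chunks; a rough sketch (the chunk size, the concurrency numbers, and the save_result_to_db() helper are placeholders, not part of the function above):

// Sketch: run validate_urls() over the full list in chunks so memory stays
// bounded even with hundreds of thousands of URLs. save_result_to_db() is a
// hypothetical persistence helper, not a real function.
foreach (array_chunk($urls, 5000) as $chunk) {
    $results = validate_urls($chunk, 500, 10000, true, true);
    foreach ($results as $url => $info) {
        list($ok, $errno, $reason) = $info;
        save_result_to_db($url, $ok, $reason);
    }
}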
Answer 1 (score: 1)
This is the fastest I could get it, using a tiny ping:
$domains = ['google.nl', 'blablaasdasdasd.nl', 'bing.com'];

foreach ($domains as $domain) {
    // escapeshellarg() guards against shell injection from free-text domains
    $exists = null !== shell_exec("ping " . escapeshellarg($domain) . " -c1 -s1 -t1");
    echo $domain . ' ' . ($exists ? 'exists' : 'gone');
    echo '<br />' . PHP_EOL;
}
-c → count (1 ping is enough)
-s → packet size (we only need 1 byte)
-t → timeout → how long to wait when there is no response. You may need to tweak this one (note that on Linux, ping's -t sets the TTL and -W sets the timeout; -t means timeout on BSD/macOS).

Keep in mind that some servers don't respond to ping. I don't know what percentage don't, but I'd suggest a better second check for everything that fails the ping check; it should dramatically narrow down the results.
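A rough sketch of that two-stage idea (ping first, full cURL check only for the failures; validate_urls() is borrowed from the other answer, and -W1 is the Linux ping timeout flag):

// Two-stage sketch: cheap ping first, then a full HTTP check (via the
// validate_urls() function from the other answer) only for hosts that
// did not reply to the ping.
$needsHttpCheck = [];
foreach ($domains as $domain) {
    $alive = null !== shell_exec("ping " . escapeshellarg($domain) . " -c1 -s1 -W1");
    if (!$alive) {
        $needsHttpCheck[] = $domain; // no ping reply; verify over HTTP instead
    }
}
$results = validate_urls($needsHttpCheck, 100, 10000, true, true);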