我正在寻找一种更好的方法,通过 cURL 从大量 URL 抓取内容,希望你能在多线程或其他思路上给我一些建议。我想保存超过 5,000,000 个网页的 HTML 代码。
/**
 * Fetch a URL with cURL and hand back whatever curl_exec() produced.
 *
 * The request is sent with a fixed referer and user agent, without response
 * headers in the output, and with a 10 second overall timeout. The script is
 * terminated with a message if the cURL extension is not loaded.
 *
 * @param string $Url Address to download.
 * @return string|bool Response body, or false when curl_exec() fails.
 */
function curl_download($Url){
    function_exists('curl_init') or die('Sorry cURL is not installed!');

    // Option table, applied in the order listed.
    $settings = array(
        CURLOPT_URL            => $Url,
        CURLOPT_REFERER        => "http://www.url.de/?aktion=suche",
        CURLOPT_USERAGENT      => "Mozilla/1.0",
        CURLOPT_HEADER         => 0,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_TIMEOUT        => 10,
    );

    $handle = curl_init();
    foreach ($settings as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
// Sequentially download every page id 1..450000 and store its HTML in the
// `hb` table, printing each id as a progress indicator.
//
// NOTE(review): the mysql_* extension is deprecated and was removed in PHP 7.
// It is kept because the connection setup is not visible in this snippet;
// migrate to mysqli or PDO prepared statements where the connection is opened.
for ($i = 1; $i <= 450000; $i++) {
    $html = curl_download('http://www.url.de/id='.$i.'&land=be');

    if ($html === false) {
        // Do not store an empty row for a page that could not be fetched.
        error_log("curl_download failed for id $i");
    } else {
        // Downloaded HTML is untrusted and full of quotes: escape it before
        // embedding it in the statement, otherwise the INSERT breaks or can
        // be abused for SQL injection.
        $escaped = mysql_real_escape_string($html);
        if (mysql_query("INSERT INTO hb (content)
VALUES('$escaped')") === false) {
            error_log("INSERT failed for id $i: " . mysql_error());
        }
    }

    echo "$i ";
}
谢谢你的帮助。
答案 0 :(得分:-1)
我正在使用多线程 cURL(curl_multi)来快速检查大量代理。
下面是我根据你的需求准备的代码:
// Download page ids 1..450000 in parallel with curl_multi and store each
// successfully fetched body in the `hb` table.
//
// Only $max_parallel transfers are in flight at any time: creating all
// 450000 easy handles up front would exhaust memory and sockets. Each
// finished handle is removed and closed, and a new one is queued in its place.
//
// NOTE(review): the mysql_* extension is deprecated and was removed in PHP 7.
// It is kept because the connection setup is not visible in this snippet;
// migrate to mysqli or PDO prepared statements where the connection is opened.
$mc = curl_multi_init();
$i = 1;
$last_id = 450000;
$max_parallel = 50;   // upper bound on simultaneous transfers
$in_flight = 0;       // handles currently registered with $mc
$running = 0;

do {
    // Top the window up with new transfers.
    while ($in_flight < $max_parallel && $i <= $last_id) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, 'http://www.url.de/id='.$i.'&land=be');
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_multi_add_handle($mc, $ch);
        $in_flight++;
        $i++;
    }

    // Drive the transfers.
    while (($execrun = curl_multi_exec($mc, $running)) == CURLM_CALL_MULTI_PERFORM) ;
    if ($execrun != CURLM_OK) break;

    // Collect everything that finished during this pass.
    while ($done = curl_multi_info_read($mc)) {
        $ch = $done['handle'];

        if ($done['result'] === CURLE_OK) {
            // Downloaded HTML is untrusted: escape it before building the SQL.
            $html = mysql_real_escape_string(curl_multi_getcontent($ch));
            if (mysql_query("INSERT INTO hb (content) VALUES('$html')") === false) {
                error_log('INSERT failed: ' . mysql_error());
            }
        } else {
            // Do not store an empty row for a failed transfer.
            error_log('Transfer failed: ' . curl_error($ch));
        }

        curl_multi_remove_handle($mc, $ch);
        curl_close($ch);   // release the easy handle's memory and socket
        $in_flight--;
    }

    // Sleep until there is socket activity instead of spinning at 100% CPU.
    if ($running > 0 && curl_multi_select($mc, 1.0) === -1) {
        // select can fail on some systems; back off briefly and retry.
        usleep(100000);
    }
} while ($running > 0 || $i <= $last_id);

curl_multi_close($mc);
我自己用来检查代理的原始代码是:
// Check every proxy in $proxyL in parallel: request SERVER_URL/checkProxy.php
// through each proxy and append the proxies whose response body is exactly
// 'ok' to $proxy.
$proxies = $proxyL;
$proxy_count = count($proxies);
$c = array();   // start clean so stale handles from earlier code are never searched
if (!isset($proxy)) {
    $proxy = array();
}

$mc = curl_multi_init();
for ($thread_no = 0; $thread_no < $proxy_count; $thread_no++) {
    $c [$thread_no] = curl_init();
    curl_setopt($c [$thread_no], CURLOPT_URL, SERVER_URL . '/checkProxy.php');
    curl_setopt($c [$thread_no], CURLOPT_HEADER, 0);
    curl_setopt($c [$thread_no], CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($c [$thread_no], CURLOPT_CONNECTTIMEOUT, 5);
    curl_setopt($c [$thread_no], CURLOPT_TIMEOUT, 10);
    curl_setopt($c [$thread_no], CURLOPT_PROXY, trim($proxies [$thread_no]));
    curl_setopt($c [$thread_no], CURLOPT_PROXYTYPE, 0);
    curl_multi_add_handle($mc, $c [$thread_no]);
}

$running = 0;
do {
    while (($execrun = curl_multi_exec($mc, $running)) == CURLM_CALL_MULTI_PERFORM) ;
    if ($execrun != CURLM_OK) break;

    while ($done = curl_multi_info_read($mc)) {
        $handle = $done['handle'];

        if (curl_multi_getcontent($handle) == 'ok') {
            // Strict search: loose comparison of PHP 8 CurlHandle objects can
            // match a different handle and record the wrong proxy.
            $index = array_search($handle, $c, true);
            if ($index !== false) {
                $proxy[] = $proxies [$index];
            }
        }

        curl_multi_remove_handle($mc, $handle);
        curl_close($handle);   // release the easy handle's memory and socket
    }

    // Sleep until there is socket activity instead of spinning at 100% CPU.
    if ($running > 0 && curl_multi_select($mc, 1.0) === -1) {
        // select can fail on some systems; back off briefly and retry.
        usleep(100000);
    }
} while ($running);
curl_multi_close($mc);