我正在寻找一种更好的方法,使用 cURL 从 URL 抓取内容

时间:2016-01-05 12:01:53

标签: php parsing curl screen-scraping

我正在寻找一种更好的方法,使用 cURL 从 URL 抓取内容,希望你们能在多线程或其他思路上给我一些建议。我想保存超过 5,000,000 个网页的 HTML 代码。



/**
 * Download the body of a single URL with cURL.
 *
 * Terminates the script if the cURL extension is not loaded.
 *
 * @param string $Url Address to fetch.
 * @return string|bool Response body as returned by curl_exec(); false when
 *                     the transfer fails (e.g. the 10 second timeout hits).
 */
function curl_download($Url){
    // Guard clause: nothing below can work without the cURL extension.
    if (!function_exists('curl_init')){
        die('Sorry cURL is not installed!');
    }

    $handle = curl_init();

    // Same options as before, applied in the same order in one call.
    curl_setopt_array($handle, array(
        CURLOPT_URL            => $Url,
        CURLOPT_REFERER        => "http://www.url.de/?aktion=suche",
        CURLOPT_USERAGENT      => "Mozilla/1.0",
        CURLOPT_HEADER         => 0,     // body only, no response headers
        CURLOPT_RETURNTRANSFER => true,  // hand the body back instead of printing it
        CURLOPT_TIMEOUT        => 10,    // seconds for the whole transfer
    ));

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}

// Sequentially download pages id=1..450000 and store each body in table `hb`.
// Relies on curl_download() and on a MySQL connection opened elsewhere.
$i = 1; // integer counter (was the string "1", which only worked via type juggling)

while ($i <= 450000)
{
    $html = curl_download('http://www.url.de/id='.$i.'&land=be');

    // curl_exec() yields false on failure; skip those instead of storing an
    // empty row.
    if ($html !== false) {
        // The page body is untrusted text and almost always contains quotes,
        // so it must be escaped before being embedded in the SQL statement.
        mysql_query("INSERT INTO hb (content) VALUES('" . mysql_real_escape_string($html) . "')");
    }

    echo "$i "; // progress indicator
    $i++;
}
谢谢你的帮助。

1 个答案:

答案 0 :(得分:-1)

我正在使用 cURL 的多句柄接口(curl_multi,即"多线程" cURL)来快速检查大量代理

这是我为您的需求准备的代码

// Download pages id=1..450000 concurrently through the curl_multi interface
// and store each body in table `hb` (MySQL connection is opened elsewhere).
//
// Only a bounded window of transfers is kept open at any time: registering
// all 450000 handles up front would exhaust memory and sockets before the
// first request even completes.
$mc = curl_multi_init();
$max_parallel = 50;     // upper bound on simultaneously open transfers
$last_id      = 450000; // highest id to fetch
$i       = 1;           // next id to schedule
$active  = 0;           // handles currently registered with $mc
$running = 0;           // transfers still in flight, as reported by cURL

do {
    // Top the window up with new transfers while ids remain.
    while ($active < $max_parallel && $i <= $last_id) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, 'http://www.url.de/id='.$i.'&land=be');
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_multi_add_handle($mc, $ch);
        $active++;
        $i++;
    }

    // Drive all registered transfers.
    while (($execrun = curl_multi_exec($mc, $running)) == CURLM_CALL_MULTI_PERFORM) ;
    if ($execrun != CURLM_OK) break;

    // Collect every transfer that finished during this round.
    while ($done = curl_multi_info_read($mc)) {
        $ch = $done['handle'];

        // Store the body only for transfers that completed without error.
        if ($done['result'] == CURLE_OK) {
            $html = curl_multi_getcontent($ch);
            // Page content is untrusted and full of quotes: escape it before
            // embedding it in the SQL statement.
            mysql_query("INSERT INTO hb (content) VALUES('" . mysql_real_escape_string($html) . "')");
        }

        curl_multi_remove_handle($mc, $ch);
        curl_close($ch); // release the easy handle; it is not reused
        $active--;
    }

    // Sleep until there is socket activity instead of spinning at 100% CPU.
    if ($running > 0 && curl_multi_select($mc, 1.0) == -1) {
        usleep(100000); // select unavailable/failed: back off briefly
    }
} while ($running > 0 || $active > 0 || $i <= $last_id);
curl_multi_close($mc);

下面是我用来检查代理的原始代码:

// Check a list of proxies in parallel: request SERVER_URL/checkProxy.php
// through each proxy and append to $proxy every proxy whose response body is
// exactly 'ok'. $proxyL and SERVER_URL are defined elsewhere.
$proxies = $proxyL;
$proxy_count = count($proxies); // evaluated once, not on every iteration
$c = array();                   // index => easy handle; reset so no stale handles remain
if (!isset($proxy)) {
    $proxy = array();           // list of working proxies
}

$mc = curl_multi_init();
for ($thread_no = 0; $thread_no < $proxy_count; $thread_no++) {
    $c [$thread_no] = curl_init();
    curl_setopt($c [$thread_no], CURLOPT_URL, SERVER_URL . '/checkProxy.php');
    curl_setopt($c [$thread_no], CURLOPT_HEADER, 0);
    curl_setopt($c [$thread_no], CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($c [$thread_no], CURLOPT_CONNECTTIMEOUT, 5);
    curl_setopt($c [$thread_no], CURLOPT_TIMEOUT, 10);
    curl_setopt($c [$thread_no], CURLOPT_PROXY, trim($proxies [$thread_no]));
    curl_setopt($c [$thread_no], CURLOPT_PROXYTYPE, 0);
    curl_multi_add_handle($mc, $c [$thread_no]);
}

$running = 0;
do {
    // Drive all registered transfers.
    while (($execrun = curl_multi_exec($mc, $running)) == CURLM_CALL_MULTI_PERFORM) ;
    if ($execrun != CURLM_OK) break;

    while ($done = curl_multi_info_read($mc)) {
        $ch = $done['handle'];
        if (curl_multi_getcontent($ch) == 'ok') {
            // Strict search: a loose comparison of handle objects (PHP 8
            // CurlHandle) treats every handle as equal and would always
            // report the first proxy.
            $index = array_search($ch, $c, true);
            if ($index !== false) {
                $proxy[] = $proxies [$index];
            }
        }
        curl_multi_remove_handle($mc, $ch);
        curl_close($ch); // release the finished easy handle
    }

    // Wait for socket activity instead of spinning at 100% CPU.
    if ($running && curl_multi_select($mc, 1.0) == -1) {
        usleep(100000); // select unavailable/failed: back off briefly
    }
} while ($running);
curl_multi_close($mc);