Making a PHP recursive scraper scrape faster

Date: 2016-06-25 09:22:00

Tags: php multithreading web-scraping

I have been working on a scraper for a month, trying to scrape links (hrefs) fetched from MySQL.

I have optimized it as much as I could. I have tried:

  1. exec shell with a curl call for parallel processing.
  2. exec shell calling a PHP script for parallel processing.
  3. pthreads, which I could not get working properly (I don't know why).
  4. Recursively calling a function that gets the links from a website and then scrapes those links further.

Right now, after filtering out invalid links (#, javascript(void), etc.), I get roughly 50,000 to 60,000 records in about 30 minutes, but many of them are duplicates. If I query only the distinct values from these records, I have about 50,000 records.

Here is my code:

    function multiRequest($urls) {
        global $link;

        $filter_links = array();
        $rolling_window = sizeof($urls);

        $master = curl_multi_init();

        // add additional curl options here
        $std_options = array(
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_CONNECTTIMEOUT => 35,
            CURLOPT_HEADER => false,
            CURLOPT_TIMEOUT => 30
        );
        $options = $std_options;

        // start the first batch of requests
        for ($i = 0; $i < $rolling_window; $i++) {
            $ch = curl_init();
            $options[CURLOPT_URL] = $urls[$i];
            $options[CURLOPT_PRIVATE] = $urls[$i];
            curl_setopt_array($ch, $options);
            curl_multi_add_handle($master, $ch);
        }

        do {
            while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
            if ($execrun != CURLM_OK) {
                break;
            }
            // a request was just completed -- find out which one
            while ($done = curl_multi_info_read($master)) {
                $available_curl = curl_getinfo($done['handle'], CURLINFO_PRIVATE);
                $html = curl_multi_getcontent($done['handle']);

                $domDoc = new DOMDocument('1.0');
                @$domDoc->loadHTML($html);

                $anchors = $domDoc->getElementsByTagName('a');
                foreach ($anchors as $element) {
                    $href = $element->getAttribute('href');
                    $href = rtrim($href, "/");
                    $href = trim($href);

                    if ((strpos($href, '#') !== false) || $href == '' || $href == $available_curl || (strpos($href, 'javascript:') !== false) || (strpos($href, 'index.php') !== false) || preg_match('/mailto:/', $href) || (strpos($href, '.jpg') !== false) || (strpos($href, '.jpeg') !== false) || (strpos($href, '.png') !== false) ||
                            (strpos($href, '.gif') !== false) || (strpos($href, '.tiff') !== false) || (strpos($href, '.tif') !== false) || (strpos($href, '.pdf') !== false)) {
                        continue;
                    }
                    if (0 !== strpos($href, 'http')) {
                        $path = '/' . ltrim($href, '/');

                        $parts = parse_url($available_curl);

                        $href = $parts['scheme'] . '://';
                        $href .= $parts['host'];
                        if (isset($parts['port'])) {
                            $href .= ':' . $parts['port'];
                        }
                        $href .= $path;
                    }

                    $href = rtrim($href, "/");
                    $filter_links[] = $href;
                }

                $filter_links = array_unique($filter_links);
                $scraped_domain = remove_http($available_curl);
                $scraped_domain_key = key_domain_generator($scraped_domain);
                mysqli_query($link, "UPDATE domains SET is_scraped=1, total_scraped_links = '" . count($filter_links) . "' WHERE domain_u_key = '" . $scraped_domain_key . "'") or die(mysqli_error($link));
                $namecheap_filter_internal_array = extrnl_intrnl_filter($filter_links, $available_curl);

                curl_multi_remove_handle($master, $done['handle']);
            }
        } while ($running);

        curl_multi_close($master);

        if (count($namecheap_filter_internal_array) > 0) {
            multiRequest($namecheap_filter_internal_array);
        }
    }

    function extrnl_intrnl_filter($href_array, $domain_link) {
        global $link;
        $is_external = 0;
        $workers = [];
        $x_count = 0;
        foreach ($href_array as $href) {
            $href_url = parse_url($href);
            $href_domain = $href_url['host'];
            $key_href = giveHost($href_domain);
            if (isexternal($href_domain, $domain_link) == 'External') {
                $domains_Query = "select count(*) as domain_found from domains where base_url='$key_href'";
                $domains_run_Query = mysqli_query($link, $domains_Query) or die(mysqli_error($link));
                $domaininfo = mysqli_fetch_assoc($domains_run_Query);
                if ($domaininfo['domain_found'] > 0) {

                } else {
                    if (preg_match('/^[-a-z0-9]+\.[a-z]{2,6}$/', strtolower($key_href))) {
                        $is_external = 1;
                        if (domain_insert_check($href, $is_external)) {
                            echo 'prgress';
                            $workers[$x_count] = new WorkerThreads($href);
                            $workers[$x_count]->start();
                            $x_count++;

                            //exec("nohup curl --url http://37.59.1.141/tool2/index2.php?data=" . $domain_list_scrap . " > /dev/null 2> /dev/null &");
                            //exec("nohup php /var/www/test/tool2/index2.php " . $href . " > /dev/null 2> /dev/null &");
                            //exec("nohup php /var/www/test/tool2/index2.php?data=" . $href . " > /dev/null 2> /dev/null &");
                            //exec("nohup curl --url http://37.59.1.141/tool2/index2.php?data=" . $href . " > /dev/null 2> /dev/null &");
                        }
                    }
                }
            } else {
                $is_external = 0;
                if (domain_insert_check($href, $is_external)) {
                    $workers[$x_count] = new WorkerThreads($href);
                    $workers[$x_count]->start();
                    $x_count++;
                    $namecheap_filter_internal_array[] = $href;
                }
            }
        }
        for ($forvar = 0; $forvar < $x_count; $forvar++) {
            $workers[$forvar]->join();
        }

        return array_unique($namecheap_filter_internal_array);
    }

    function domain_insert_check($href, $is_external) {
        global $link;
        $href_url = parse_url($href);
        $href_ex_https = remove_http($href);
        $href_domain = $href_url['host'];
        $href_scheme = $href_url['scheme'];
        $key_href_i = key_domain_generator($href_ex_https);

        $query = "insert into domains set domain_name = '" . addslashes($href_ex_https) . "',"
                . "doamin_schema = '" . $href_scheme . "',"
                . "base_url = '" . strtolower(giveHost($href_domain)) . "',"
                . "domain_u_key = '" . $key_href_i . "',"
                . "is_expired = '0',"
                . "is_scraped = '0',"
                . "is_external = '" . $is_external . "',"
                . "ExtBackLinks = '0',"
                . "RefDomains='0',"
                . "ACRank = '0',"
                . "RefIPs = '0',"
                . "RefSubNets = '0',"
                . "RefDomainsEDU = '0',"
                . "RefDomainsGOV = '0',"
                . "Title = 'title',"
                . "total_scraped_links = '0',"
                . "CitationFlow = '0',"
                . "TrustFlow = '0',"
                . "TopicalTrustFlow_Topic_0 = 'TopicalTrustFlow_Topic_0',"
                . "TopicalTrustFlow_Value_0 = '0',"
                . "TopicalTrustFlow_Topic_1 = 'TopicalTrustFlow_Topic_1',"
                . "TopicalTrustFlow_Value_1 = '0',"
                . "TopicalTrustFlow_Topic_2 = 'TopicalTrustFlow_Topic_2',"
                . "TopicalTrustFlow_Value_2 = '0',"
                . "date_created = '" . date('Y-m-d H:i:s') . "',"
                . "user_id = 1";

        $result = mysqli_query($link, $query);
        if (!$result) {
            mysqli_query($link, "insert into domainerror SET  error = '" . $key_href_i . "' , domains= '" . $href_ex_https . "', type='fail'");
            return false;
        } else {
            return true;
        }
    }

I really don't know how to optimize this further so that it can scrape more records. I have optimized it as much as I can. If I use PHP calls instead of curl, it hits the MySQL max connections limit, and if I use pthreads, it runs for a while and then stops.

1 answer:

Answer 0: (score: 0)

My first suggestion is to drop DOMDocument and replace it with a regex; it is faster, with a smaller memory footprint and shorter parsing time.
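As a rough illustration only (the exact pattern below is an assumption, and regex-based HTML parsing is brittle on malformed markup), extracting hrefs that way might look like:

    // Sketch: pull href values out of the raw HTML in one regex pass
    // instead of building a DOMDocument. Assumes $html holds the page body.
    preg_match_all('/<a\s[^>]*href\s*=\s*["\']([^"\']+)["\']/i', $html, $matches);
    $hrefs = $matches[1]; // captured href attribute values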

Another, smaller improvement is to replace linear array searches with hash-map lookups, which are O(1), wherever possible. For example, instead of:

    $filter_links = array_unique($filter_links);

you should keep a map like $urlMap[$urlKey] = $url; and only insert a URL when its key is not already present. A fast way to compute the key is md5, though there are faster options. A sketch follows below.
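A minimal sketch of that idea, using md5 purely as an illustrative key function ($urlMap and $urlKey are the answer's suggested names, not anything from the original code):

    // Sketch: deduplicate links as they are found instead of calling
    // array_unique() over the whole array on every completed request.
    $urlKey = md5($href);
    if (!isset($urlMap[$urlKey])) {   // O(1) hash lookup
        $urlMap[$urlKey] = $href;     // store each URL only once
    }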

Another important I/O problem I see is that you do a database insert for every website you scrape. You could collect that data into another array and insert all of the website data into your SQL server in one go at the end, as in the sketch below.
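Something along these lines, for instance ($pendingRows, the two-column INSERT, and the flush point are assumptions to keep the sketch short; your real table has many more columns):

    // Sketch: buffer rows in memory while scraping...
    $pendingRows[] = "('" . mysqli_real_escape_string($link, $href_ex_https) . "', '" . (int) $is_external . "')";

    // ...then flush the whole batch with a single multi-row INSERT.
    if (count($pendingRows) > 0) {
        mysqli_query($link, "INSERT INTO domains (domain_name, is_external) VALUES " . implode(',', $pendingRows))
            or die(mysqli_error($link));
        $pendingRows = array();
    }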

Even so, that will only buy you some speedup. To really scale, you will have to think about splitting the work across multiple servers. For that you need a queue system; you could use RabbitMQ https://www.rabbitmq.com/ (a rough publishing sketch follows).
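As an illustration of that direction (assuming the php-amqplib client is installed via Composer; the queue name, credentials, and host are placeholders, not anything from the question), publishing each discovered URL to a queue could look like:

    require_once __DIR__ . '/vendor/autoload.php';

    use PhpAmqpLib\Connection\AMQPStreamConnection;
    use PhpAmqpLib\Message\AMQPMessage;

    // Sketch: one process publishes URLs, any number of worker servers consume them.
    $connection = new AMQPStreamConnection('localhost', 5672, 'guest', 'guest');
    $channel = $connection->channel();
    $channel->queue_declare('scrape_urls', false, true, false, false); // durable queue

    $msg = new AMQPMessage($href, array('delivery_mode' => AMQPMessage::DELIVERY_MODE_PERSISTENT));
    $channel->basic_publish($msg, '', 'scrape_urls');

    $channel->close();
    $connection->close();

Workers on other machines would then consume from the same queue and run the scraping/insert logic, so the crawl rate scales with the number of consumers rather than with a single recursive process.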