Making a PHP recursive scraper scrape faster

Date: 2016-06-25 09:22:00

Tags: php multithreading web-scraping

I have been working on a scraper for a month, trying to scrape links (hrefs) fetched from MySQL.

I have optimized it as much as I could. I have tried:

  1. exec shell with a curl call for parallel processing.
  2. exec shell calling a PHP script for parallel processing.
  3. pthreads, which I could not get working properly (I don't know why).
  4. Recursively calling a function that gets the links from a website and then scrapes those links further.

Right now, after filtering out invalid links (#, javascript(void), etc.), I get roughly 50,000 to 60,000 records in about 30 minutes, but many of them are duplicates. If I query only the distinct values from these records, I have about 50,000 records.

Here is my code:

    function multiRequest($urls) {
        global $link;

        $filter_links = array();
        $rolling_window = sizeof($urls);

        $master = curl_multi_init();

        // add additional curl options here
        $std_options = array(
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_CONNECTTIMEOUT => 35,
            CURLOPT_HEADER => false,
            CURLOPT_TIMEOUT => 30
        );
        $options = $std_options;

        // start the first batch of requests
        for ($i = 0; $i < $rolling_window; $i++) {
            $ch = curl_init();
            $options[CURLOPT_URL] = $urls[$i];
            $options[CURLOPT_PRIVATE] = $urls[$i];
            curl_setopt_array($ch, $options);
            curl_multi_add_handle($master, $ch);
        }

        do {
            while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
            if ($execrun != CURLM_OK) {
                break;
            }
            // a request was just completed -- find out which one
            while ($done = curl_multi_info_read($master)) {
                $available_curl = curl_getinfo($done['handle'], CURLINFO_PRIVATE);
                $html = curl_multi_getcontent($done['handle']);

                $domDoc = new DOMDocument('1.0');
                @$domDoc->loadHTML($html);

                $anchors = $domDoc->getElementsByTagName('a');
                foreach ($anchors as $element) {
                    $href = $element->getAttribute('href');
                    $href = rtrim($href, "/");
                    $href = trim($href);

                    if ((strpos($href, '#') !== false) || $href == '' || $href == $available_curl || (strpos($href, 'javascript:') !== false) || (strpos($href, 'index.php') !== false) || preg_match('/mailto:/', $href) || (strpos($href, '.jpg') !== false) || (strpos($href, '.jpeg') !== false) || (strpos($href, '.png') !== false) ||
                            (strpos($href, '.gif') !== false) || (strpos($href, '.tiff') !== false) || (strpos($href, '.tif') !== false) || (strpos($href, '.pdf') !== false)) {
                        continue;
                    }
                    if (0 !== strpos($href, 'http')) {
                        $path = '/' . ltrim($href, '/');

                        $parts = parse_url($available_curl);

                        $href = $parts['scheme'] . '://';
                        $href .= $parts['host'];
                        if (isset($parts['port'])) {
                            $href .= ':' . $parts['port'];
                        }
                        $href .= $path;
                    }

                    $href = rtrim($href, "/");
                    $filter_links[] = $href;
                }

                $filter_links = array_unique($filter_links);
                $scraped_domain = remove_http($available_curl);
                $scraped_domain_key = key_domain_generator($scraped_domain);
                mysqli_query($link, "UPDATE domains SET is_scraped=1, total_scraped_links = '" . count($filter_links) . "' WHERE domain_u_key = '" . $scraped_domain_key . "'") or die(mysqli_error($link));
                $namecheap_filter_internal_array = extrnl_intrnl_filter($filter_links, $available_curl);

                curl_multi_remove_handle($master, $done['handle']);
            }
        } while ($running);

        curl_multi_close($master);

        if (count($namecheap_filter_internal_array) > 0) {
            multiRequest($namecheap_filter_internal_array);
        }
    }

    function extrnl_intrnl_filter($href_array, $domain_link) {
        global $link;
        $is_external = 0;
        $workers = [];
        $x_count = 0;
        foreach ($href_array as $href) {
            $href_url = parse_url($href);
            $href_domain = $href_url['host'];
            $key_href = giveHost($href_domain);
            if (isexternal($href_domain, $domain_link) == 'External') {
                $domains_Query = "select count(*) as domain_found from domains where base_url='$key_href'";
                $domains_run_Query = mysqli_query($link, $domains_Query) or die(mysqli_error($link));
                $domaininfo = mysqli_fetch_assoc($domains_run_Query);
                if ($domaininfo['domain_found'] > 0) {

                } else {
                    if (preg_match('/^[-a-z0-9]+\.[a-z]{2,6}$/', strtolower($key_href))) {
                        $is_external = 1;
                        if (domain_insert_check($href, $is_external)) {
                            echo 'prgress';
                            $workers[$x_count] = new WorkerThreads($href);
                            $workers[$x_count]->start();
                            $x_count++;

                            //exec("nohup curl --url http://37.59.1.141/tool2/index2.php?data=" . $domain_list_scrap . " > /dev/null 2> /dev/null &");
                            //exec("nohup php /var/www/test/tool2/index2.php " . $href . " > /dev/null 2> /dev/null &");
                            //exec("nohup php /var/www/test/tool2/index2.php?data=" . $href . " > /dev/null 2> /dev/null &");
                            //exec("nohup curl --url http://37.59.1.141/tool2/index2.php?data=" . $href . " > /dev/null 2> /dev/null &");
                        }
                    }
                }
            } else {
                $is_external = 0;
                if (domain_insert_check($href, $is_external)) {
                    $workers[$x_count] = new WorkerThreads($href);
                    $workers[$x_count]->start();
                    $x_count++;
                    $namecheap_filter_internal_array[] = $href;
                }
            }
        }
        for ($forvar = 0; $forvar < $x_count; $forvar++) {
            $workers[$forvar]->join();
        }

        return array_unique($namecheap_filter_internal_array);
    }

    function domain_insert_check($href, $is_external) {
        global $link;
        $href_url = parse_url($href);
        $href_ex_https = remove_http($href);
        $href_domain = $href_url['host'];
        $href_scheme = $href_url['scheme'];
        $key_href_i = key_domain_generator($href_ex_https);

        $query = "insert into domains set domain_name = '" . addslashes($href_ex_https) . "',"
                . "doamin_schema = '" . $href_scheme . "',"
                . "base_url = '" . strtolower(giveHost($href_domain)) . "',"
                . "domain_u_key = '" . $key_href_i . "',"
                . "is_expired = '0',"
                . "is_scraped = '0',"
                . "is_external = '" . $is_external . "',"
                . "ExtBackLinks = '0',"
                . "RefDomains='0',"
                . "ACRank = '0',"
                . "RefIPs = '0',"
                . "RefSubNets = '0',"
                . "RefDomainsEDU = '0',"
                . "RefDomainsGOV = '0',"
                . "Title = 'title',"
                . "total_scraped_links = '0',"
                . "CitationFlow = '0',"
                . "TrustFlow = '0',"
                . "TopicalTrustFlow_Topic_0 = 'TopicalTrustFlow_Topic_0',"
                . "TopicalTrustFlow_Value_0 = '0',"
                . "TopicalTrustFlow_Topic_1 = 'TopicalTrustFlow_Topic_1',"
                . "TopicalTrustFlow_Value_1 = '0',"
                . "TopicalTrustFlow_Topic_2 = 'TopicalTrustFlow_Topic_2',"
                . "TopicalTrustFlow_Value_2 = '0',"
                . "date_created = '" . date('Y-m-d H:i:s') . "',"
                . "user_id = 1";

        $result = mysqli_query($link, $query);
        if (!$result) {
            mysqli_query($link, "insert into domainerror SET  error = '" . $key_href_i . "' , domains= '" . $href_ex_https . "', type='fail'");
            return false;
        } else {
            return true;
        }
    }

I really don't know how to optimize this further so that it can scrape more records. I have optimized it as much as I can. If I use PHP calls instead of curl, it hits the MySQL max connections limit, and if I use pthreads, it runs for a while and then stops.

1 answer:

Answer 0: (score: 0)

My first suggestion is to drop DOMDocument and replace it with a regex; it is faster, with a smaller memory footprint and shorter parsing time.
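As a rough illustration only (the exact pattern below is an assumption, and regex-based HTML parsing is brittle on malformed markup), extracting hrefs that way might look like:

    // Sketch: pull href values out of the raw HTML in one regex pass
    // instead of building a DOMDocument. Assumes $html holds the page body.
    preg_match_all('/<a\s[^>]*href\s*=\s*["\']([^"\']+)["\']/i', $html, $matches);
    $hrefs = $matches[1]; // captured href attribute values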

Another, smaller improvement is to replace linear array searches with hash-map lookups, which are O(1), wherever possible. For example, instead of:

    $filter_links = array_unique($filter_links);

you should keep a map like $urlMap[$urlKey] = $url; and only insert a URL when its key is not already present. A fast way to compute the key is md5, though there are faster options. A sketch follows below.
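A minimal sketch of that idea, using md5 purely as an illustrative key function ($urlMap and $urlKey are the answer's suggested names, not anything from the original code):

    // Sketch: deduplicate links as they are found instead of calling
    // array_unique() over the whole array on every completed request.
    $urlKey = md5($href);
    if (!isset($urlMap[$urlKey])) {   // O(1) hash lookup
        $urlMap[$urlKey] = $href;     // store each URL only once
    }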

Another important I/O problem I see is that you do a database insert for every website you scrape. You could collect that data into another array and insert all of the website data into your SQL server in one go at the end, as in the sketch below.
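Something along these lines, for instance ($pendingRows, the two-column INSERT, and the flush point are assumptions to keep the sketch short; your real table has many more columns):

    // Sketch: buffer rows in memory while scraping...
    $pendingRows[] = "('" . mysqli_real_escape_string($link, $href_ex_https) . "', '" . (int) $is_external . "')";

    // ...then flush the whole batch with a single multi-row INSERT.
    if (count($pendingRows) > 0) {
        mysqli_query($link, "INSERT INTO domains (domain_name, is_external) VALUES " . implode(',', $pendingRows))
            or die(mysqli_error($link));
        $pendingRows = array();
    }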

Even so, that will only buy you some speedup. To really scale, you will have to think about splitting the work across multiple servers. For that you need a queue system; you could use RabbitMQ https://www.rabbitmq.com/ (a rough publishing sketch follows).
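As an illustration of that direction (assuming the php-amqplib client is installed via Composer; the queue name, credentials, and host are placeholders, not anything from the question), publishing each discovered URL to a queue could look like:

    require_once __DIR__ . '/vendor/autoload.php';

    use PhpAmqpLib\Connection\AMQPStreamConnection;
    use PhpAmqpLib\Message\AMQPMessage;

    // Sketch: one process publishes URLs, any number of worker servers consume them.
    $connection = new AMQPStreamConnection('localhost', 5672, 'guest', 'guest');
    $channel = $connection->channel();
    $channel->queue_declare('scrape_urls', false, true, false, false); // durable queue

    $msg = new AMQPMessage($href, array('delivery_mode' => AMQPMessage::DELIVERY_MODE_PERSISTENT));
    $channel->basic_publish($msg, '', 'scrape_urls');

    $channel->close();
    $connection->close();

Workers on other machines would then consume from the same queue and run the scraping/insert logic, so the crawl rate scales with the number of consumers rather than with a single recursive process.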