I have been working on a scraper for a month, trying to scrape the links (href) fetched from MySQL.
I have applied every optimization I could think of. I call the function recursively: it fetches links from a website and then scrapes those links in turn.
Right now I get roughly 50,000 to 60,000 records in about 30 minutes (after filtering out invalid links such as #, javascript:void, etc.), but most of them are probably duplicates; if I query the distinct values from these records, I only get about 50,000 records.
Here is my code:
function multiRequest($urls) {
    global $link;
    $filter_links = array();
    $rolling_window = sizeof($urls);
    $master = curl_multi_init();
    // add additional curl options here
    $std_options = array(CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_CONNECTTIMEOUT => 35,
        CURLOPT_HEADER => false,
        CURLOPT_TIMEOUT => 30);
    $options = $std_options;
    // start the first batch of requests
    for ($i = 0; $i < $rolling_window; $i++) {
        $ch = curl_init();
        $options[CURLOPT_URL] = $urls[$i];
        $options[CURLOPT_PRIVATE] = $urls[$i];
        curl_setopt_array($ch, $options);
        curl_multi_add_handle($master, $ch);
    }
    do {
        while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
        if ($execrun != CURLM_OK) {
            break;
        }
        // a request was just completed -- find out which one
        while ($done = curl_multi_info_read($master)) {
            $available_curl = curl_getinfo($done['handle'], CURLINFO_PRIVATE);
            $html = curl_multi_getcontent($done['handle']);
            $domDoc = new DOMDocument('1.0');
            @$domDoc->loadHTML($html);
            $anchors = $domDoc->getElementsByTagName('a');
            foreach ($anchors as $element) {
                $href = $element->getAttribute('href');
                $href = rtrim($href, "/");
                $href = trim($href);
                if ((strpos($href, '#') !== false) || $href == '' || $href == $available_curl
                        || (strpos($href, 'javascript:') !== false) || (strpos($href, 'index.php') !== false)
                        || preg_match('/mailto:/', $href) || (strpos($href, '.jpg') !== false)
                        || (strpos($href, '.jpeg') !== false) || (strpos($href, '.png') !== false)
                        || (strpos($href, '.gif') !== false) || (strpos($href, '.tiff') !== false)
                        || (strpos($href, '.tif') !== false) || (strpos($href, '.pdf') !== false)) {
                    continue;
                }
                if (0 !== strpos($href, 'http')) {
                    $path = '/' . ltrim($href, '/');
                    $parts = parse_url($available_curl);
                    $href = $parts['scheme'] . '://';
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
                $href = rtrim($href, "/");
                $filter_links[] = $href;
            }
            $filter_links = array_unique($filter_links);
            $scraped_domain = remove_http($available_curl);
            $scraped_domain_key = key_domain_generator($scraped_domain);
            mysqli_query($link, "UPDATE domains SET is_scraped=1, total_scraped_links = '" . count($filter_links) . "' WHERE domain_u_key = '" . $scraped_domain_key . "'") or die(mysqli_error($link));
            $namecheap_filter_internal_array = extrnl_intrnl_filter($filter_links, $available_curl);
            curl_multi_remove_handle($master, $done['handle']);
        }
    } while ($running);
    curl_multi_close($master);
    if (count($namecheap_filter_internal_array) > 0) {
        multiRequest($namecheap_filter_internal_array);
    }
}
function extrnl_intrnl_filter($href_array, $domain_link) {
    global $link;
    $is_external = 0;
    $workers = [];
    $x_count = 0;
    foreach ($href_array as $href) {
        $href_url = parse_url($href);
        $href_domain = $href_url['host'];
        $key_href = giveHost($href_domain);
        if (isexternal($href_domain, $domain_link) == 'External') {
            $domains_Query = "select count(*) as domain_found from domains where base_url='$key_href'";
            $domains_run_Query = mysqli_query($link, $domains_Query) or die(mysqli_error($link));
            $domaininfo = mysqli_fetch_assoc($domains_run_Query);
            if ($domaininfo['domain_found'] > 0) {
            } else {
                if (preg_match('/^[-a-z0-9]+\.[a-z]{2,6}$/', strtolower($key_href))) {
                    $is_external = 1;
                    if (domain_insert_check($href, $is_external)) {
                        echo 'prgress';
                        $workers[$x_count] = new WorkerThreads($href);
                        $workers[$x_count]->start();
                        $x_count++;
                        //exec("nohup curl --url http://37.59.1.141/tool2/index2.php?data=" . $domain_list_scrap . " > /dev/null 2> /dev/null &");
                        //exec("nohup php /var/www/test/tool2/index2.php " . $href . " > /dev/null 2> /dev/null &");
                        //exec("nohup php /var/www/test/tool2/index2.php?data=" . $href . " > /dev/null 2> /dev/null &");
                        //exec("nohup curl --url http://37.59.1.141/tool2/index2.php?data=" . $href . " > /dev/null 2> /dev/null &");
                    }
                }
            }
        } else {
            $is_external = 0;
            if (domain_insert_check($href, $is_external)) {
                $workers[$x_count] = new WorkerThreads($href);
                $workers[$x_count]->start();
                $x_count++;
                $namecheap_filter_internal_array[] = $href;
            }
        }
    }
    for ($forvar = 0; $forvar < $x_count; $forvar++) {
        $workers[$forvar]->join();
    }
    return array_unique($namecheap_filter_internal_array);
}
function domain_insert_check($href, $is_external) {
    global $link;
    $href_url = parse_url($href);
    $href_ex_https = remove_http($href);
    $href_domain = $href_url['host'];
    $href_scheme = $href_url['scheme'];
    $key_href_i = key_domain_generator($href_ex_https);
    $query = "insert into domains set domain_name = '" . addslashes($href_ex_https) . "',"
            . "doamin_schema = '" . $href_scheme . "',"
            . "base_url = '" . strtolower(giveHost($href_domain)) . "',"
            . "domain_u_key = '" . $key_href_i . "',"
            . "is_expired = '0',"
            . "is_scraped = '0',"
            . "is_external = '" . $is_external . "',"
            . "ExtBackLinks = '0',"
            . "RefDomains='0',"
            . "ACRank = '0',"
            . "RefIPs = '0',"
            . "RefSubNets = '0',"
            . "RefDomainsEDU = '0',"
            . "RefDomainsGOV = '0',"
            . "Title = 'title',"
            . "total_scraped_links = '0',"
            . "CitationFlow = '0',"
            . "TrustFlow = '0',"
            . "TopicalTrustFlow_Topic_0 = 'TopicalTrustFlow_Topic_0',"
            . "TopicalTrustFlow_Value_0 = '0',"
            . "TopicalTrustFlow_Topic_1 = 'TopicalTrustFlow_Topic_1',"
            . "TopicalTrustFlow_Value_1 = '0',"
            . "TopicalTrustFlow_Topic_2 = 'TopicalTrustFlow_Topic_2',"
            . "TopicalTrustFlow_Value_2 = '0',"
            . "date_created = '" . date('Y-m-d H:i:s') . "',"
            . "user_id = 1";
    $result = mysqli_query($link, $query);
    if (!$result) {
        mysqli_query($link, "insert into domainerror SET error = '" . $key_href_i . "' , domains= '" . $href_ex_https . "', type='fail'");
        return false;
    } else {
        return true;
    }
}
I really don't know how to optimize this any further so it can scrape more records; I have optimized it as much as I can. If I use PHP calls instead of curl, it exhausts the MySQL max connections, and if I use pthreads, it runs for a while and then stops.
Answer 0 (score: 0)
My first suggestion is to drop DOMDocument and replace it with a regex; it is faster, uses less memory, and takes less time to parse.
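As a rough illustration of that idea (not the answer's exact code), a simplified regex-based extractor could look like the sketch below; the pattern assumes reasonably well-formed, quoted anchor tags and will miss edge cases that DOMDocument handles:

function extract_hrefs($html) {
    // Pull href values straight out of the raw HTML instead of building a DOM tree.
    // The pattern is deliberately simple and only matches quoted href attributes.
    $hrefs = array();
    if (preg_match_all('/<a\s[^>]*href=["\']([^"\']+)["\']/i', $html, $matches)) {
        foreach ($matches[1] as $href) {
            $hrefs[] = trim($href);
        }
    }
    return $hrefs;
}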
Another, smaller improvement is to replace the array searches with O(1) lookups by using hash maps wherever possible, for example here:
$filter_links = array_unique($filter_links);
Instead of that, you should keep a map like $urlMap[$urlKey] = $url; and insert only when the key is not already present. A quick way to compute the key is md5, although there are faster hashes.
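A minimal sketch of that, assuming $found_hrefs holds the hrefs just extracted from one page (the variable names are placeholders):

$urlMap = array();
foreach ($found_hrefs as $href) {
    $urlKey = md5($href);               // quick to compute; faster hashes (e.g. crc32) also work
    if (!isset($urlMap[$urlKey])) {     // O(1) membership check instead of scanning the array
        $urlMap[$urlKey] = $href;       // only the first occurrence is kept
    }
}
$filter_links = array_values($urlMap);  // already unique, no array_unique() pass needed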
Another big I/O problem I see is that you insert into the database for every site you scrape. You could collect the data in another array instead and insert all of the site data into your SQL server in one go at the end.
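A hedged sketch of that batching idea (helper names are made up, and only two columns are shown for brevity; the real domains table in the question has many more, so the column list would need extending):

$pending_rows = array();

function queue_domain_row($link, $domain_name, $is_external) {
    global $pending_rows;
    // Collect one VALUES tuple per scraped link instead of running an INSERT immediately.
    $pending_rows[] = "('" . mysqli_real_escape_string($link, $domain_name) . "', " . (int) $is_external . ")";
}

function flush_domain_rows($link) {
    global $pending_rows;
    if (count($pending_rows) === 0) {
        return;
    }
    // One multi-row INSERT replaces hundreds of single-row inserts.
    $sql = "INSERT INTO domains (domain_name, is_external) VALUES " . implode(',', $pending_rows);
    mysqli_query($link, $sql) or die(mysqli_error($link));
    $pending_rows = array();
}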
With that you will still only gain some speedup; to really scale, you will have to think about splitting the process across multiple servers. For that you need a queue system, for example RabbitMQ: https://www.rabbitmq.com/
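For example, a minimal publisher sketch using the php-amqplib client (my assumption for the library; the connection details, the queue name scrape_urls, and the $urls_to_scrape variable are placeholders) that pushes each URL onto a queue so worker processes on other servers can consume and scrape them:

require_once __DIR__ . '/vendor/autoload.php'; // composer require php-amqplib/php-amqplib

use PhpAmqpLib\Connection\AMQPStreamConnection;
use PhpAmqpLib\Message\AMQPMessage;

$connection = new AMQPStreamConnection('localhost', 5672, 'guest', 'guest');
$channel = $connection->channel();
$channel->queue_declare('scrape_urls', false, true, false, false); // durable queue

foreach ($urls_to_scrape as $url) {
    // One message per URL; a separate worker script on each server
    // consumes the queue and runs the scraping for that URL.
    $msg = new AMQPMessage($url, array('delivery_mode' => AMQPMessage::DELIVERY_MODE_PERSISTENT));
    $channel->basic_publish($msg, '', 'scrape_urls');
}

$channel->close();
$connection->close();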