我知道这不是最精简的代码,但我已经尽可能地削减了。该脚本消耗的内存越来越多,直到内存最终耗尽。我尽可能使用了 unset(),但似乎没有任何效果。错误似乎总是出现在 MultiGet 函数中,但我不确定泄漏是否就发生在那里。任何意见都将不胜感激。
/**
 * Walk every record of products.dbf (located next to this file), keep the
 * records whose GRP value has a matching row in tbl_categories, and crawl
 * the product pages for those records in batches of up to 100.
 *
 * For each crawled page that _extract_more_info() can parse, the extracted
 * data is tagged with the record's category id and part number and handed
 * to _parse_product().
 *
 * NOTE(review): this issues one tbl_categories query per DBF record. If
 * $this->db is a CodeIgniter database object with query logging enabled
 * (save_queries), every one of those queries is retained in memory for the
 * lifetime of the script — worth confirming when chasing memory growth.
 */
public function Test()
{
    $base = dirname(__FILE__) . '/';
    $prod_file = $base . 'products.dbf';
    $this->dbf->load($prod_file);

    // The original read these through an undefined $ci variable; the DBF
    // reader is the same object that was loaded above via $this->dbf.
    $num_rec = $this->dbf->dbf_num_rec;
    $buffer = array();

    for ($i = 0; $i < $num_rec; $i++) {
        $row = $this->dbf->getRowAssoc($i);
        $info = array(
            'part_number' => $row['PART_NUM'],
            'td_group_id' => $row['GRP'],
            // NOTE(review): this stores the literal string 'DESCR', not
            // $row['DESCR'] — kept as-is because nothing below reads it;
            // confirm the intent.
            'name' => 'DESCR',
        );

        $this->db->where('td_group_id', $info['td_group_id']);
        $result = $this->db->get('tbl_categories')->row_array();
        if (isset($result['id'])) {
            $info['category_id'] = $result['id'];
            $buffer[] = $info;
        }

        // Flush when the batch is full or when the last record was reached.
        $is_last_record = ($i == $num_rec - 1);
        if (count($buffer) != 100 && !$is_last_record) {
            continue;
        }
        // Nothing queued (e.g. the last records had no category): skip the crawl.
        if (!$buffer) {
            continue;
        }

        $url_buffer = array();
        foreach ($buffer as $item) {
            $url_buffer[] = $this->_product_url($item['part_number']);
        }

        // Results are keyed by the position of the URL in $url_buffer, which
        // is also the position of the matching record in $buffer.
        $html_returns = $this->MultiCrawl($url_buffer);
        foreach ($html_returns as $url_index => $html) {
            $more_info = $this->_extract_more_info($html);
            if ($more_info) {
                $more_info['category_id'] = $buffer[$url_index]['category_id'];
                $more_info['td_part_number'] = $buffer[$url_index]['part_number'];
                $this->_parse_product($more_info);
            }
        }

        // Release the batch (up to 100 HTML documents) before the next one is built.
        unset($html_returns, $url_buffer, $html, $more_info);
        $buffer = array();
    }
}
/**
 * Fetch a list of URLs concurrently through curl_multi and return the
 * bodies of the responses that finished with HTTP status 200.
 *
 * At most 15 transfers are in flight at any time. Every transfer is
 * removed from the multi handle and closed as soon as it completes,
 * whatever its status, so a failing URL can neither leak its handle nor
 * keep the loop running forever. Completed transfers are matched to their
 * request by handle identity rather than by effective URL, so redirects
 * (CURLOPT_FOLLOWLOCATION is on) and duplicate URLs are attributed to the
 * right request.
 *
 * @param array $all_urls Zero-based list of URLs to fetch.
 * @return array Response bodies keyed by the position of the URL in
 *               $all_urls. URLs whose transfer did not end with status 200
 *               have no entry; "ERROR: <status>" is echoed for each of them.
 */
function MultiGet($all_urls)
{
    $useragent = $this->_useragent;
    $cookie_file = $this->_cookie_file;
    $max_connections = 15;

    $return_buffer = array();
    $handles = array();   // request index => handle currently in flight
    $url_count = count($all_urls);
    $next_index = 0;      // next position in $all_urls to start
    $finished = 0;        // transfers completed, successfully or not
    $running = 0;

    $mh = curl_multi_init();

    while ($finished < $url_count) {
        // Top up the pool of in-flight transfers.
        while (count($handles) < $max_connections && $next_index < $url_count) {
            $request_index = $next_index++;
            $handle = curl_init($all_urls[$request_index]);
            if ($handle === false) {
                // Could not even create the handle: count it as a failed transfer.
                echo "ERROR: 0\n";
                $finished++;
                continue;
            }
            curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($handle, CURLOPT_COOKIESESSION, false);
            // NOTE(review): certificate and host verification are disabled,
            // as in the original; this accepts any TLS peer.
            curl_setopt($handle, CURLOPT_SSL_VERIFYHOST, false);
            curl_setopt($handle, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($handle, CURLOPT_COOKIEJAR, $cookie_file);
            curl_setopt($handle, CURLOPT_COOKIEFILE, $cookie_file);
            curl_setopt($handle, CURLOPT_USERAGENT, $useragent);
            curl_multi_add_handle($mh, $handle);
            $handles[$request_index] = $handle;
        }

        do {
            $exec_status = curl_multi_exec($mh, $running);
        } while ($exec_status === CURLM_CALL_MULTI_PERFORM);

        // Collect every transfer that has completed so far.
        $drained = 0;
        while ($info = curl_multi_info_read($mh)) {
            $drained++;
            $handle = $info['handle'];
            // Strict search compares handle identity (resource or object).
            $request_index = array_search($handle, $handles, true);

            $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
            if ($status == 200 && $request_index !== false) {
                $return_buffer[$request_index] = curl_multi_getcontent($handle);
            } else {
                echo "ERROR: $status\n";
            }

            // Always release the handle, not only on success.
            curl_multi_remove_handle($mh, $handle);
            curl_close($handle);
            if ($request_index !== false) {
                unset($handles[$request_index]);
                $finished++;
            }
        }

        if ($drained > 0) {
            // Slots were freed; refill them before waiting.
            continue;
        }

        if ($running > 0) {
            // Wait for socket activity; -1 means select failed, so back off
            // briefly instead of spinning.
            if (curl_multi_select($mh) === -1) {
                usleep(1000);
            }
        } elseif ($handles) {
            // Nothing is running and nothing was reported, yet handles are
            // still registered: no further progress is possible.
            break;
        }
    }

    // Release anything still registered (only possible after the break above).
    foreach ($handles as $handle) {
        curl_multi_remove_handle($mh, $handle);
        curl_close($handle);
    }
    curl_multi_close($mh);

    return $return_buffer;
}
/**
 * Extract the product fields from a product page.
 *
 * Fields are looked up one XPath query at a time, in a fixed order. The
 * image, price, MSRP, manufacturer part number, manufacturer, short and
 * long description and the main specs table are required: if any of them
 * is missing the method returns null. It also returns null when the price
 * text is exactly 'Req. Auth.'. The UPC and the additional specs table are
 * optional and are reported as null when absent.
 *
 * NOTE(review): every lookup goes through _xquery(), which builds a new
 * DOMDocument from $html on each call, so a full page is parsed up to ten
 * times here.
 *
 * @param string $html Raw HTML of the product page.
 * @return array|null Map with keys td_img_url, price, msrp,
 *                    manf_part_number, upc_part_number, manufacturer,
 *                    short_description, long_description, main_specs and
 *                    additional_specs, or null as described above.
 */
private function _extract_more_info($html)
{
    $image = $this->_query_first_node($html, "//img[@id='ctl00_cphMain_cntrlProductProfile_imgprodimage']");
    if (!$image) {
        return null;
    }
    $buffer = array('td_img_url' => trim($image->getAttribute('src')));

    // Text fields in extraction order: result key => array(XPath, required?).
    $text_fields = array(
        'price' => array("//span[@class='priceLarge']", true),
        'msrp' => array("//span[@id='ctl00_cphMain_cntrlProductProfile_newLtFinalPrice']", true),
        'manf_part_number' => array("//span[@id='ctl00_cphMain_cntrlProductProfile_newLTMRF']", true),
        'upc_part_number' => array("//span[@id='ctl00_cphMain_cntrlProductProfile_newLblUPC']", false),
        'manufacturer' => array("//td[@class='black_text_WUL']", true),
        'short_description' => array("//td[@class='textt' and @colspan='3']", true),
        'long_description' => array("//div[@id='ctl00_cphMain_pnlMarketingDesc']//td[@class='textt']", true),
    );

    foreach ($text_fields as $key => $field) {
        list($query, $required) = $field;
        $node = $this->_query_first_node($html, $query);
        if (!$node) {
            if ($required) {
                return null;
            }
            $buffer[$key] = null;
            continue;
        }
        $buffer[$key] = trim($node->nodeValue);

        // A price of 'Req. Auth.' disqualifies the whole page.
        if ($key === 'price' && $buffer[$key] === 'Req. Auth.') {
            return null;
        }
    }

    $main_table = $this->_query_first_node($html, "//table[@id='ctl00_cphMain_cntrlMainSpecs_dgSpecs']");
    if (!$main_table) {
        return null;
    }
    $buffer['main_specs'] = $this->_spec_table_to_array($main_table);

    $extra_table = $this->_query_first_node($html, "//table[@id='ctl00_cphMain_cntrlExtSpecs_tblData']");
    $buffer['additional_specs'] = $extra_table ? $this->_spec_table_to_array($extra_table) : null;

    return $buffer;
}

/**
 * Run one XPath query through _xquery() and pass a DOMNode result through
 * _to_dom_node(); any other result yields null.
 */
private function _query_first_node($html, $query)
{
    $result = $this->_xquery($html, $query);
    return $result instanceof DOMNode ? $this->_to_dom_node($result) : null;
}

/**
 * Turn the rows of a specs table into a caption => value map.
 *
 * Only rows with one or two <td> descendants contribute: the first cell is
 * the caption, the second (when present) the value, otherwise the value is
 * null. Rows whose trimmed caption is falsy are skipped.
 */
private function _spec_table_to_array($table)
{
    $pairs = array();
    foreach ($table->getElementsByTagName('tr') as $tr) {
        $cells = $tr->getElementsByTagName('td');
        $cell_count = $cells->length;
        if ($cell_count < 1 || $cell_count > 2) {
            continue;
        }
        $caption = trim($cells->item(0)->nodeValue);
        if (!$caption) {
            continue;
        }
        $pairs[$caption] = $cell_count == 2 ? trim($cells->item(1)->nodeValue) : null;
    }
    return $pairs;
}
/**
 * Parse an HTML string and evaluate an XPath query against it.
 *
 * @param string $html     HTML source to parse.
 * @param string $query    XPath expression.
 * @param bool   $allnodes When false (default) return only the first
 *                         matching node; otherwise return the whole
 *                         DOMNodeList.
 * @return DOMNode|DOMNodeList|null The match(es), or null when the source
 *                         is empty, cannot be parsed, the query is invalid,
 *                         or nothing matches.
 */
private function _xquery($html, $query, $allnodes = false)
{
    // DOMDocument::loadHTML() rejects an empty source (a ValueError on
    // PHP 8, which error suppression would not catch).
    if ($html === null || $html === '') {
        return null;
    }

    $dom = new DOMDocument();

    // Collect parser warnings internally instead of silencing them with @,
    // then clear the buffer right away so they cannot pile up across the
    // many pages parsed in one run. The previous setting is restored.
    $previous = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if (!$loaded) {
        return null;
    }

    $xpath = new DOMXPath($dom);
    $nodeList = $xpath->query($query);

    // query() returns false for a malformed expression.
    if ($nodeList === false || $nodeList->length === 0) {
        return null;
    }

    return $allnodes ? $nodeList : $nodeList->item(0);
}
答案 0 :(得分:1)
找到泄漏的策略?
如果 `foo` 是 O(n),`bar` 也是 O(n),而 `bar` 又调用了 `foo`,那么结果可能会变成 O(n * n)。
`MultiCrawl` 方法负责抓取。你确定那里不会出现循环吗?(遍历文件夹时我不止一次被骗过:把 '.' 当作子文件夹来浏览会产生无限循环)