我想抓取一个需要三级抓取的网站。我先获取所有分页,然后在每个分页中获取图片、标题和网址;该网址会跳转到一个单独的详情页,其中包含更多信息,如描述、日期等。如果我使用 `foreach`,得到的结果是错误的;如果用 `for` 代替 `foreach`,则只返回一个对象。我该如何处理(用 `for` 代替 `foreach`)?
<?php
/**
 * Three-level scraper for the Everfest EDM festival listing.
 *
 * Level 1: read the pagination links from the landing page.
 * Level 2: on every listing page, collect each festival's detail link, image and title.
 * Level 3: open each festival's detail page and read its description, date and venue.
 *
 * Variables left behind for the rest of the script:
 *   $stackHref   - absolute listing-page URLs taken from the pagination bar
 *   $eventDetail - list of arrays with the keys pagelink, img, title, description,
 *                  Date and venue (a field is null when its node was not found)
 *   $error_log   - one line per failed download or XPath query; persisting it is
 *                  left to the caller
 */

$baseUrl  = 'https://www.everfest.com';
$startUrl = $baseUrl . '/music/edm-festivals';

$stackHref   = array();
$eventDetail = array();
$error_log   = ''; // must exist before the first ".=" below

/**
 * Download a URL and wrap the parsed HTML in an XPath evaluator.
 *
 * @param string $url Absolute URL to fetch.
 * @return DOMXPath|null Null when the download fails or the body is empty.
 */
$loadXPath = function ($url) {
    $html = file_get_contents($url);
    // A failed download yields false, and loadHTML() cannot parse an empty body.
    if ($html === false || trim($html) === '') {
        return null;
    }

    $dom = new DOMDocument();
    // Real-world HTML is rarely valid; collect parser complaints instead of emitting warnings.
    $previous = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return new DOMXPath($dom);
};

/**
 * Evaluate an XPath expression and return the nodeValue of every match, in document order.
 *
 * @param DOMXPath $xpath      Evaluator bound to the page being scraped.
 * @param string   $expression XPath expression.
 * @return string[] Empty when nothing matches or the expression is rejected.
 */
$nodeValues = function (DOMXPath $xpath, $expression) use (&$error_log) {
    $nodes = $xpath->query($expression);
    // query() reports a bad expression with false, which isset() cannot detect.
    if ($nodes === false) {
        $error_log .= "Error on XPath query: {$expression}\n";
        return array();
    }

    $values = array();
    foreach ($nodes as $node) {
        $values[] = $node->nodeValue;
    }

    return $values;
};

/**
 * Evaluate an XPath expression and return the nodeValue of the first match only.
 *
 * @param DOMXPath $xpath      Evaluator bound to the page being scraped.
 * @param string   $expression XPath expression.
 * @return string|null Null when nothing matches.
 */
$firstValue = function (DOMXPath $xpath, $expression) use ($nodeValues) {
    $values = $nodeValues($xpath, $expression);

    return isset($values[0]) ? $values[0] : null;
};

// Level 1: discover the listing pages.
$landing = $loadXPath($startUrl);
if ($landing === null) {
    $error_log .= "Error on file_get_contents({$startUrl})\n";
} else {
    foreach ($nodeValues($landing, "(//ul[@class='pagination'])[1]/li/a/@href") as $href) {
        $pageUrl = $baseUrl . $href;
        // Skip repeated targets so no listing page is scraped twice
        // (pagination bars may link the same page more than once - e.g. a "next" link).
        if (!in_array($pageUrl, $stackHref, true)) {
            $stackHref[] = $pageUrl;
        }
    }
}

// Level 2: walk every listing page.
foreach ($stackHref as $listingUrl) {
    $listing = $loadXPath($listingUrl);
    if ($listing === null) {
        $error_log .= "Error on file_get_contents({$listingUrl})\n";
        continue;
    }

    // The three lists are parallel: entry N of each is assumed to describe the same
    // festival card. They are queried once per page, not once per card.
    $pageLinks = $nodeValues($listing, '//div[@class="festival-card grow"]/a[1]/@href');
    $images    = $nodeValues($listing, "//div[contains(@class,'columns medium-6 large-4')]/div[contains(@class,'grow')]/a/img/@src");
    $titles    = $nodeValues($listing, "//div[contains(@class,'clearfix')]/a[1]/text()");

    foreach ($pageLinks as $index => $href) {
        $event = array(
            'pagelink'    => $baseUrl . $href,
            'img'         => isset($images[$index]) ? $images[$index] : null,
            'title'       => isset($titles[$index]) ? $titles[$index] : null,
            'description' => null,
            'Date'        => null,
            'venue'       => null,
        );

        // Level 3: the detail page describes exactly one festival, so take the FIRST
        // match of each query. Indexing these results with the listing position only
        // finds a node for the first card of a page and yields nothing for the rest.
        $detail = $loadXPath($event['pagelink']);
        if ($detail === null) {
            $error_log .= "Error on file_get_contents({$event['pagelink']})\n";
        } else {
            $event['description'] = $firstValue($detail, '//div[@class="columns"]/div[contains(@class,"card-white")]/p[contains(@class,"")]/span[1]/following-sibling::text()[1]');
            $event['Date']        = $firstValue($detail, '//div[@id="signup"]/div[@class="row"]/div[contains(@class,"columns")][1]/p/text()[1]');
            $event['venue']       = $firstValue($detail, '//div[@id="signup"]/div[@class="row"]/div[contains(@class,"columns")][1]/p/text()[2]');
        }

        $eventDetail[] = $event;
    }
}
?>