我正在尝试使用PHP从网站抓取图片。
我要抓取的页面是: http://www.reebonz.com.sg/event/t7349#/event/t7349
但是使用我的代码我只得到我的标题的href。我的代码是:
<?php
require_once ('function.php');
$advt_id = "88477";
$programurl = "http://www.reebonz.com.sg/event_list/1/";
$baseurl = "http://www.reebonz.com.sg/event_list/1/";
$crawl_data []= array ( "department" => 0, "category" => "bags" , "advt_cat" => "BALENCIAGA", "cat_url" => 'http://www.reebonz.com.sg/event/t7349#/event/t7349');
$data = get_data($url);
$product_raw = splice_data ($data, 'ul class="rec-items-ul ng-scope"',1, '</ul>',1);
$product_list = splice_list ($product_raw, 'href="', '"');
echo "\n**** Got Product List ".count($product_list)." ***\n";
print_r ($product_list);
foreach ($product_list as $product)
{
if ((strlen($product) < 10))
{
echo $product;
continue;
}
$url = $baseurl.$product;
$data = get_data($url);
$img_data = splice_data ($data, 'class="rbz_product-zoom-image row"', 1, '</div>', 1);
$img_url = splice_data ($img_data, 'href="',1, '"', 1);
echo $img_url;
$filePath = $crawl_cat['category']."\\".$crawl_cat['advt_cat'];
if (!file_exists($filePath)) {
mkdir($filePath, 0777, true);
}
grab_image($img_url,$filePath);
//grab_image($img_url5,$filePath);
echo "*";
}// end of product insert for
?>
function.php是:
function splice_data ($data, $startstr, $startoccur, $endstr, $endoccur)
{
if ($startoccur > 1)
{
for ($i = 1, $startpos = 1 ; $startoccur >= $i; $i++, $startpos++)
{
$startpos = stripos($data,$startstr,$startpos);
//echo $startpos. "\n";
}
$start = $startpos;
}
else
$start = stripos($data,$startstr,$startoccur);
$start_index = strlen($startstr);
$end = stripos($data,$endstr,$start + $start_index ) ;
$splice_data = substr($data,$start + $start_index, $end - ($start + $start_index) );
return $splice_data;
}
function splice_list ($img_data, $start_str, $end_str, $find = '', $replace = '')
{
for ($i = 1, $j = 1; stripos($img_data,$start_str,$i) > 1 ;)
{
$start = stripos($img_data,$start_str,$i);
$start_len = strlen($start_str);
$end = stripos($img_data,$end_str,$start + $start_len) ;
$data_list[] = str_replace($find,$replace,substr($img_data, $start + $start_len , $end - $start - $start_len)) ;
$i = $end;
$j++;
}
$result = array_unique($data_list);
return $result;
}
function get_data($url, $ckfile="", $cookie="")
{
$toCheckURL = $url;
// This all sets up the CURL actions to check the page
$header=array(
// 'User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12',
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language: en-us,en;q=0.5',
'Accept-Encoding: gzip,deflate',
'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Keep-Alive: 115',
'Connection: keep-alive',
);
$proxies = array();
$ch = curl_init();
if (isset($proxy)) { // If the $proxy variable is set, then
curl_setopt($ch, CURLOPT_PROXY, $proxy); // Set CURLOPT_PROXY with proxy in $proxy variable
}
curl_setopt($ch, CURLOPT_URL, $toCheckURL);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_NOBODY, false);
if (isset($ckfile) && $ckfile !="" and !empty($ckfile))
{
curl_setopt ($ch, CURLOPT_COOKIEFILE, $ckfile);
}
curl_setopt($ch,CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,60);
curl_setopt($ch, CURLOPT_TIMEOUT,90);
curl_setopt($ch, CURLOPT_MAXREDIRS, 10); //follow up to 10 redirections - avoids loops
if($cookie != "")
curl_setopt($ch,CURLOPT_HTTPHEADER, array($cookie));
curl_setopt($ch,CURLOPT_USERAGENT,$agents[array_rand($agents)]);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
我现在得到的输出是:
**** Got Product List 8 *** Array ( [0] => //netdna.bootstrapcdn.com/twitter-
bootstrap/2.3.2/css/bootstrap-combined.no-icons.min.css [1] => //netdna.bootstrapcdn.com/font-
awesome/3.2.1/css/font-awesome.css [2] =>
http://www.reebonz.com.sg/sites/all/themes/custom/octopus2/xfavicon.ico.pagespeed.ic.jT8Y7LgYBc.png
[3] => http://www.octopus2.local/sites/all/themes/custom/octopus2/css/reebonz-ie.css [4] =>
http://www.reebonz.com.sg/sites/all/modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.core.min.css,qn1a78z+modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.theme.min.css,qn1a78z+modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.slider.min.css,qn1a78z+modules,_contrib,_panels,_css,_panels.css,qn1a78z+modules,_custom,_mailcheck,_css,_mailcheck.css,qn1a78z+themes,_custom,_octopus2,_css,_bootstrap.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-core.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-social-network.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-....
我的代码有什么问题?有没有简单的方法呢?
答案 0 :(得分:0)
使用php DomDocument:
$doc = new DOMDocument();
$doc->loadHTML(your_html_code);
$images = $doc->getElementsByTagName('img');
foreach ($images as $img) {
//do whatever you like
}
答案 1 :(得分:0)
下载此库:http://sourceforge.net/projects/simplehtmldom/
以下代码将起作用
(在顶部包含该库)
<?php
error_reporting(1);
include_once('simple_html_dom.php');
$html = new simple_html_dom();
$html->load_file('https://www.google.co.in/search?q=shahrukh+khan&newwindow=1&biw=1375&bih=791&source=lnms&tbm=isch&sa=X&sqi=2&ved=0ahUKEwi1rO6AjZrKAhWSBY4KHWSGBDQQ_AUIBygC');
$reviews = $html->find('img');
$fetched_images = '';
foreach($reviews as $link)
{
//find review ID if not null
if($link->{'src'} != ''){
$review_ID = $link->{'src'};
$fetched_images[] = $review_ID;
}
}
?>
<ul>
<?php foreach ($fetched_images as $fetched_image): ?>
<li style="display:inline-block"><img src="<?php echo $fetched_image;?>"></li>
<?php endforeach ?>
</ul>
答案 2 :(得分:0)
<?php
include_once('simple_html_dom.php');
$target_url = "Your URL here";
$html = new simple_html_dom();
$html->load_file($target_url);
$images = $html->find('img');
/**foreach($images as $link){
//find review ID if not null
if($link->{'src'} != ''){
$image_ID = $link->{'src'};
$fetched_images[] = $image_ID;
}
}*/
foreach ($images as $fetched_image){
echo $fetched_image;
}
?>