从网站上抓取图像

时间:2014-03-03 07:50:17

标签: php curl web-crawler

我正在尝试使用PHP从网站抓取图片。

我要抓取的页面是: http://www.reebonz.com.sg/event/t7349#/event/t7349

但是使用我的代码我只得到我的标题的href。我的代码是:

<?php

require_once ('function.php');

$advt_id = "88477";

$programurl = "http://www.reebonz.com.sg/event_list/1/";

$baseurl = "http://www.reebonz.com.sg/event_list/1/";

$crawl_data []= array ( "department" => 0, "category" => "bags" , "advt_cat" => "BALENCIAGA", "cat_url" => 'http://www.reebonz.com.sg/event/t7349#/event/t7349');


    $data =  get_data($url);

    $product_raw =  splice_data ($data, 'ul class="rec-items-ul ng-scope"',1, '</ul>',1); 

    $product_list = splice_list ($product_raw, 'href="', '"');  

    echo "\n**** Got Product List ".count($product_list)." ***\n";

        print_r ($product_list);

            foreach ($product_list as $product)
                {   

                    if ((strlen($product) < 10))
                        {
                        echo $product;
                        continue;
                        }

                    $url = $baseurl.$product;

                    $data =  get_data($url);


                    $img_data =  splice_data ($data, 'class="rbz_product-zoom-image row"', 1, '</div>', 1);


                    $img_url = splice_data ($img_data, 'href="',1, '"', 1);



                    echo $img_url;

                    $filePath = $crawl_cat['category']."\\".$crawl_cat['advt_cat'];
                    if (!file_exists($filePath)) {
                        mkdir($filePath, 0777, true);
                    }
                    grab_image($img_url,$filePath);
                    //grab_image($img_url5,$filePath);

                    echo "*";   


                }// end of product insert for

?>

function.php是:

function splice_data ($data, $startstr, $startoccur, $endstr, $endoccur)
{

    if ($startoccur > 1)
    {
    for ($i = 1, $startpos = 1 ; $startoccur >= $i; $i++, $startpos++)
        {
            $startpos = stripos($data,$startstr,$startpos);
            //echo $startpos. "\n";

        }
        $start = $startpos; 
    }

    else
    $start = stripos($data,$startstr,$startoccur);

    $start_index = strlen($startstr);



    $end = stripos($data,$endstr,$start + $start_index ) ;

    $splice_data =  substr($data,$start + $start_index, $end - ($start + $start_index) );

    return $splice_data;

}


function splice_list ($img_data, $start_str, $end_str, $find = '', $replace = '')
{

    for ($i = 1, $j = 1; stripos($img_data,$start_str,$i) > 1 ;)
    {

    $start = stripos($img_data,$start_str,$i);

    $start_len =  strlen($start_str);

    $end = stripos($img_data,$end_str,$start + $start_len) ;

    $data_list[] = str_replace($find,$replace,substr($img_data, $start + $start_len , $end - $start - $start_len)) ;

    $i = $end; 

    $j++;

    }

    $result = array_unique($data_list);

    return $result; 
}


function get_data($url, $ckfile="", $cookie="")
{

$toCheckURL = $url;
// This all sets up the CURL actions to check the page

$header=array(
 // 'User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12',
  'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language: en-us,en;q=0.5',
  'Accept-Encoding: gzip,deflate',
  'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  'Keep-Alive: 115',
  'Connection: keep-alive',
);


$proxies = array();

$ch = curl_init();


if (isset($proxy)) {    // If the $proxy variable is set, then
    curl_setopt($ch, CURLOPT_PROXY, $proxy);    // Set CURLOPT_PROXY with proxy in $proxy variable
}



curl_setopt($ch, CURLOPT_URL, $toCheckURL);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_NOBODY, false);

if (isset($ckfile) && $ckfile !="" and !empty($ckfile))
{
    curl_setopt ($ch, CURLOPT_COOKIEFILE, $ckfile);
}

curl_setopt($ch,CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT,60);
curl_setopt($ch, CURLOPT_TIMEOUT,90);
curl_setopt($ch, CURLOPT_MAXREDIRS, 10); //follow up to 10 redirections - avoids loops
if($cookie != "")
    curl_setopt($ch,CURLOPT_HTTPHEADER, array($cookie));

curl_setopt($ch,CURLOPT_USERAGENT,$agents[array_rand($agents)]);


$data = curl_exec($ch);

curl_close($ch);

return $data;

}

我现在得到的输出是:

**** Got Product List 8 *** Array ( [0] => //netdna.bootstrapcdn.com/twitter-
bootstrap/2.3.2/css/bootstrap-combined.no-icons.min.css [1] => //netdna.bootstrapcdn.com/font-
awesome/3.2.1/css/font-awesome.css [2] => 
http://www.reebonz.com.sg/sites/all/themes/custom/octopus2/xfavicon.ico.pagespeed.ic.jT8Y7LgYBc.png 
[3] => http://www.octopus2.local/sites/all/themes/custom/octopus2/css/reebonz-ie.css [4] =>
 http://www.reebonz.com.sg/sites/all/modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.core.min.css,qn1a78z+modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.theme.min.css,qn1a78z+modules,_contrib,_jquery_update,_replace,_ui,_themes,_base,_minified,_jquery.ui.slider.min.css,qn1a78z+modules,_contrib,_panels,_css,_panels.css,qn1a78z+modules,_custom,_mailcheck,_css,_mailcheck.css,qn1a78z+themes,_custom,_octopus2,_css,_bootstrap.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-core.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-social-network.css,qn1a78z+themes,_custom,_octopus2,_css,_reebonz-....

我的代码有什么问题?有没有简单的方法呢?

3 个答案:

答案 0 :(得分:0)

使用php DomDocument:

$doc = new DOMDocument();
$doc->loadHTML(your_html_code);
$images = $doc->getElementsByTagName('img');
foreach ($images as $img) {
    //do whatever you like
}

答案 1 :(得分:0)

下载此库:http://sourceforge.net/projects/simplehtmldom/

以下代码将起作用

(在顶部包含该库)

<?php
error_reporting(1);
include_once('simple_html_dom.php');
$html = new simple_html_dom();

$html->load_file('https://www.google.co.in/search?q=shahrukh+khan&newwindow=1&biw=1375&bih=791&source=lnms&tbm=isch&sa=X&sqi=2&ved=0ahUKEwi1rO6AjZrKAhWSBY4KHWSGBDQQ_AUIBygC');
$reviews = $html->find('img');

$fetched_images = '';
foreach($reviews as $link)
{
  //find review ID if not null
  if($link->{'src'} != ''){
      $review_ID = $link->{'src'};
      $fetched_images[] = $review_ID;
  }
}
?>
<ul>
  <?php foreach ($fetched_images as $fetched_image): ?>
    <li style="display:inline-block"><img src="<?php echo $fetched_image;?>"></li>  
  <?php endforeach ?>

</ul>

答案 2 :(得分:0)

<?php
    include_once('simple_html_dom.php');
    $target_url = "Your URL here";
    $html = new simple_html_dom();
    $html->load_file($target_url);
    $images = $html->find('img');
    /**foreach($images as $link){
      //find review ID if not null
      if($link->{'src'} != ''){
          $image_ID = $link->{'src'};
          $fetched_images[] = $image_ID;
      }
    }*/
    foreach ($images as $fetched_image){
        echo $fetched_image; 
      }

?>