PHP 抓取器:仅提取标签之间的数据

时间:2016-03-29 11:13:18

标签: php web-scraping

我正在尝试编写一个网页抓取器(scraper),首先需要抓取 <a> 标签之间的数据。我对代码稍作修改,以提取标签之间的内容。代码如下。

<?php

  /**
   * Return the text found between two markers in a string.
   *
   * Both markers are matched case-insensitively, and the search for $end
   * starts right after the first occurrence of $start.
   *
   * @param string $data  Text to search.
   * @param string $start Opening marker; not included in the result.
   * @param string $end   Closing marker; not included in the result.
   * @return string The text between the markers, or '' when either marker
   *                cannot be found.
   */
  function scrape_between($data, $start, $end){
    // A failed download may hand us FALSE; treat it as empty text.
    $data = (string) $data;

    $from = stripos($data, $start);
    if ($from === false) {
        return '';
    }
    $from += strlen($start);

    // Same case-insensitive matching as for the opening marker, so that
    // e.g. "</A>" closes a region opened by "<a ...>".
    $to = stripos($data, $end, $from);
    if ($to === false) {
        return '';
    }

    return substr($data, $from, $to - $from);
}

/**
 * Download a URL with cURL and hand back whatever curl_exec() produced.
 *
 * The transfer follows redirects (at most 10), sets the Referer header
 * automatically on redirects, allows 120 seconds for connecting and sends a
 * desktop Firefox user-agent string.
 *
 * @param string $url Address to request.
 * @return string|bool Result of curl_exec(): the response body, or FALSE when
 *                     the transfer failed.
 */
function cURL($url) {
    $user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1";

    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_RETURNTRANSFER => true,   // return the body instead of printing it
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_CONNECTTIMEOUT => 120,
        CURLOPT_MAXREDIRS      => 10,
        CURLOPT_USERAGENT      => $user_agent,
        CURLOPT_URL            => $url,
    ));

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}
    // Download the listing page and keep only the markup between the
    // "main" and "sidebar" containers.
    $url = "http://www.imdb.com/search/title?genres=action"; 
    $results_page = curl($url); 
    $results_page = scrape_between($results_page, "<div id=\"main\">", "<div id=\"sidebar\">"); 

    // Split on the title cells. The first chunk is whatever precedes the
    // first cell, not a result, so it is dropped.
    $separate_results = explode("<td class=\"title\">", $results_page);
    array_shift($separate_results);

    // Always defined, so print_r() below never sees an undefined variable.
    $results_urls = array();
    foreach ($separate_results as $separate_result) {
        // Take only the href value of the first link in the cell.
        // Assumes the attribute is double-quoted - TODO confirm against the page markup.
        $href = scrape_between($separate_result, "<a href=\"", "\"");
        if ($href !== '' && $href !== false) {
            // No space between host and path, otherwise the URL is invalid.
            $results_urls[] = "http://www.imdb.com" . $href;
        }
    }
    print_r($results_urls);
?>

我希望最终得到的数据是一个电影标题列表。实现这一目标的正确方法是什么?我不太确定是否应该使用正则表达式。

1 个答案:

答案 0 :(得分:1)

这可能有所帮助。

  

下面的代码使用 PHP 的 DOMDocument 和 cURL,将 IMDB 电影的元数据(例如图片、标题和剧情简介)提取到一个 PHP 数组中;其中还包含若干独立的函数,可通过匹配 id、标签名或 class 来提取任意标签的内部 HTML 内容和属性值:

<?php

// Script-level DOMDocument instance kept in the global scope.
$dom = new DOMDocument();

/**
 * Fetch a URL with cURL and return the response body.
 *
 * When the transfer fails and cURL reports a non-zero error number, a
 * human-readable "cURL error (...)" line is returned in place of the body.
 *
 * @param string $url Address to request.
 * @return string|bool Response body, the error line described above, or
 *                     FALSE when the transfer failed without an error number.
 */
function disguise_curl($url)
{
    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL            => $url,
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_RETURNTRANSFER => true,   // return the body instead of printing it
    ));

    $body = curl_exec($handle);

    if ($body === false) {
        $code = curl_errno($handle);
        if ($code) {
            $reason = curl_strerror($code);
            $body = "cURL error ({$code}): {$reason}\n";
        }
    }

    curl_close($handle);

    return $body;
}

/**
 * Extract the substring that sits between two markers.
 *
 * Both markers are located case-insensitively; neither is part of the result.
 *
 * @param string $data  Text to search.
 * @param string $start Opening marker.
 * @param string $end   Closing marker.
 * @return string Text after the first $start up to the first following $end.
 */
function scrape_between($data, $start, $end){
    // Everything after the first occurrence of $start (marker stripped off).
    $remainder = substr(stristr($data, $start), strlen($start));

    // Keep only what precedes the first occurrence of $end in that remainder.
    return substr($remainder, 0, stripos($remainder, $end));
}

/**
 * Return the outer HTML of the element that carries the given id.
 *
 * @param string $id   Value of the id attribute to look up.
 * @param string $html HTML markup to parse.
 * @return string|false Serialized element, or FALSE when the markup is empty
 *                      or no element has that id.
 */
function getHTMLByID($id, $html) {
    // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8,
    // a warning before that), e.g. after a failed download.
    if (!is_string($html) || trim($html) === '') {
        return FALSE;
    }

    $dom = new DOMDocument;
    // Buffer parser complaints internally instead of emitting PHP warnings.
    libxml_use_internal_errors(true);
    $dom->validateOnParse = true;
    $dom->loadHTML($html);
    // Discard the buffered errors so they do not accumulate across calls.
    libxml_clear_errors();

    $node = $dom->getElementById($id);
    if($node) {
        return $dom->saveHTML($node);
    }
    return FALSE;
}

/**
 * Collect every element whose class attribute contains the given class name
 * as a whole, whitespace-separated token.
 *
 * @param string $class     Class name to match (e.g. 'title' matches
 *                          class="big title" but not class="subtitle").
 * @param string $html      HTML markup to parse.
 * @param bool   $bring_tag TRUE to return the matched DOM nodes themselves,
 *                          anything else to return their outer HTML strings.
 * @return array List of matches in document order; empty when the markup or
 *               the class name is empty, or nothing matches.
 */
function getHTMLByClass($class, $html, $bring_tag=false){
    $class_arr= array();

    $class = trim((string) $class);
    // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8),
    // and an empty class name cannot identify anything.
    if ($class === '' || !is_string($html) || trim($html) === '') {
        return $class_arr;
    }

    // XPath 1.0 string literals have no escape syntax: pick the quote
    // character that does not occur in the token.
    $token = ' ' . $class . ' ';
    if (strpos($token, "'") === false) {
        $literal = "'" . $token . "'";
    } elseif (strpos($token, '"') === false) {
        $literal = '"' . $token . '"';
    } else {
        // Contains both quote kinds; cannot be written as one literal.
        return $class_arr;
    }

    $dom = new DOMDocument;
    // Buffer parser complaints internally instead of emitting PHP warnings.
    libxml_use_internal_errors(true);
    $dom->validateOnParse = true;
    $dom->loadHTML($html);
    // Discard the buffered errors so they do not accumulate across calls.
    libxml_clear_errors();

    $xpath= new DOMXPath($dom);
    // Padding both sides with spaces turns the substring test into a
    // whole-token test, so 'title' no longer matches 'subtitle'.
    $results = $xpath->query(
        "//*[contains(concat(' ', normalize-space(@class), ' '), $literal)]"
    );
    if ($results === false) {
        return $class_arr;
    }

    foreach($results as $tag)
    {
        $class_arr[] = ($bring_tag===true) ? $tag : $dom->saveHTML($tag);
    }
    return $class_arr;
}

/**
 * List the value of one attribute for every element with a given tag name.
 *
 * @param string $html HTML markup to parse; an empty() value yields an empty list.
 * @param string $tag  Tag name of the elements to inspect (e.g. 'img').
 * @param string $attr Attribute to read from each element (e.g. 'src').
 * @return array One entry per matching element, in document order; elements
 *               lacking the attribute contribute an empty string.
 */
function get_domattr($html, $tag, $attr)
{
    if (empty($html)) {
        return array();
    }

    $document = new DOMDocument;
    // Buffer parser complaints internally instead of emitting PHP warnings.
    libxml_use_internal_errors(true);
    $document->validateOnParse = true;
    $document->loadHTML($html);

    $values = array();
    foreach ($document->getElementsByTagName($tag) as $element) {
        $values[] = $element->getAttribute($attr);
    }

    return $values;
}

/**
 * Return the serialized markup of every element with the given tag name.
 *
 * Each element is serialized with DOMDocument::saveXML(), i.e. in XML rather
 * than HTML form.
 *
 * @param string $tag  Tag name to collect (e.g. 'a').
 * @param string $html HTML markup to parse.
 * @return array One string per matching element, in document order; empty
 *               when the markup is empty or nothing matches.
 */
function getHTMLByTag($tag, $html) {
    $attr_vals= array();

    // DOMDocument::loadHTML() rejects an empty source (ValueError on PHP 8).
    if (empty($html) || !is_string($html) || trim($html) === '') {
        return $attr_vals;
    }

    // Use a private document, like the sibling helpers do, so the function
    // works without any global set up by the calling script.
    $dom = new DOMDocument;
    // Buffer parser complaints internally instead of emitting PHP warnings.
    libxml_use_internal_errors(true);
    $dom->validateOnParse = true;
    $dom->loadHTML($html);
    // Discard the buffered errors so they do not accumulate across calls.
    libxml_clear_errors();

    foreach($dom->getElementsByTagName($tag) as $taghtml) {
        $attr_vals[] = $dom->saveXML($taghtml);
    }
    return $attr_vals;
}

// Download the listing page once; both passes below parse this same markup.
$url = "http://www.imdb.com/search/title?genres=action";
$page_html = disguise_curl($url);

$movie_list = array();

// Pass 1: for each element matched by class 'image', store the src of its
// first <img>, or 'na' when it has none.
$result_html = getHTMLByClass('image', $page_html);
$i = 0;
foreach ($result_html as $cont_tag) {
    $img_link = get_domattr($cont_tag, 'img', 'src');
    $movie_list[$i]['photo'] = empty($img_link) ? 'na' : $img_link[0];
    $i++;
}

// Pass 2: for each element matched by class 'title', store its first link's
// markup, the absolute link URL and the first 'outline' element.
// Entries are paired with pass 1 purely by position.
$result_html = getHTMLByClass('title', $page_html);
$link_pre = 'http://imdb.com';
$i = 0;
foreach ($result_html as $cont_tag) {
    $mtitle = getHTMLByTag('a', $cont_tag);
    $movie_list[$i]['title'] = empty($mtitle) ? 'na' : $mtitle[0];

    $mlink = get_domattr($cont_tag, 'a', 'href');
    $movie_list[$i]['link'] = empty($mlink) ? 'na' : $link_pre . $mlink[0];

    $moutline = getHTMLByClass('outline', $cont_tag);
    $movie_list[$i]['outline'] = empty($moutline) ? 'na' : $moutline[0];

    $i++;
}

// Dump the collected structure inside <pre> so the array layout stays readable.
echo '<pre>', print_r($movie_list, true), '</pre>';
?>

示例输出:

Array
(
    [0] => Array
        (
            [photo] => http://ia.media-imdb.com/images/M/MV5BMjQ0MTgyNjAxMV5BMl5BanBnXkFtZTgwNjUzMDkyODE@._V1._SX54_CR0,0,54,74_.jpg
            [title] => Captain America: Civil War
            [link] => http://imdb.com/title/tt3498820/
            [outline] => Political interference in the Avengers' activities causes a rift between former allies Captain America and Iron Man.
        )

    [1] => Array
        (
            [photo] => http://ia.media-imdb.com/images/M/MV5BNTE5NzU3MTYzOF5BMl5BanBnXkFtZTgwNTM5NjQxODE@._V1._SX54_CR0,0,54,74_.jpg
            [title] => Batman v Superman: Dawn of Justice
            [link] => http://imdb.com/title/tt2975590/
            [outline] => Fearing the actions of Superman are left unchecked, Batman takes on the man of steel, while the world wrestles with what kind of a hero it really needs. With Batman and Superman fighting each other, a new threat, Doomsday, is created by Lex Luthor. It's up to Superman and Batman to set aside their differences along with Wonder Woman to stop Lex Luthor and Doomsday from destroying Metropolis.
        )

    [2] => Array
        (
            [photo] => http://ia.media-imdb.com/images/M/MV5BMTY0MDY0NjExN15BMl5BanBnXkFtZTgwOTU3OTYyODE@._V1._SX54_CR0,0,54,74_.jpg
            [title] => na
            [link] => na
            [outline] => na
        )
)