如何使用php在cURL中显示href

时间:2013-09-08 18:38:19

标签: php curl

我正在学习网页抓取(web scraping)。之前我使用的是 Simple HTML DOM Parser,但它太慢了,所以我改用了 cURL。我是通过一些博客学习的。现在我想显示两个标签之间的 href。

<?php
/**
 * Minimal page fetcher: downloads a URL with cURL and extracts the spans of
 * text found between a pair of tags.
 */
class tagSpider
{
    /** cURL handle used while a request is in flight; null otherwise. */
    public $crl = null;

    /** Body of the last fetched page ("" when nothing was fetched or the fetch failed). */
    public $html = "";

    /**
     * Kept for backward compatibility only. It used to be passed to
     * CURLOPT_BINARYTRANSFER, which the PHP manual documents as having no
     * effect (raw output is always returned with CURLOPT_RETURNTRANSFER).
     */
    public $binary = 0;

    /** URL passed to the most recent fetchPage() call. */
    public $url = "";

    /**
     * Resets the object state.
     *
     * A method named after the class is no longer treated as a constructor
     * on PHP 8, so initialisation lives in __construct().
     */
    function __construct()
    {
        $this->crl = null;
        $this->html = "";
        $this->binary = 0;
        $this->url = "";
    }

    /**
     * Downloads $url (following redirects) and stores the body in $this->html.
     *
     * @param string $url Address of the page to fetch.
     * @return bool True when a body was received, false otherwise. On failure
     *              $this->html is left as an empty string.
     */
    function fetchPage($url)
    {
        $this->url = $url;
        $this->html = "";

        // Nothing to request: skip cURL entirely instead of issuing a call
        // that is guaranteed to fail.
        if ($url === null || $url === "") {
            return false;
        }

        $this->crl = curl_init();
        if ($this->crl === false) {
            $this->crl = null;
            return false;
        }

        curl_setopt($this->crl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($this->crl, CURLOPT_URL, $this->url);
        curl_setopt($this->crl, CURLOPT_FOLLOWLOCATION, true);

        $body = curl_exec($this->crl);
        curl_close($this->crl);
        $this->crl = null;

        // curl_exec() returns false on failure; never store that in $html.
        if ($body === false) {
            return false;
        }

        $this->html = $body;
        return true;
    }

    /**
     * Returns every span of the fetched page that starts with $beg_tag and
     * ends with the nearest following $close_tag (both included).
     *
     * The match is case-insensitive, spans newlines and is ungreedy. Both
     * arguments are inserted into the pattern verbatim, so they are
     * interpreted as regular-expression fragments.
     *
     * @param string $beg_tag   Opening delimiter, e.g. '<span>'.
     * @param string $close_tag Closing delimiter, e.g. '</span>'.
     * @return array List of matched strings; empty when nothing matches or
     *               the pattern is invalid.
     */
    function parse_array($beg_tag, $close_tag)
    {
        $matching_data = array();
        $found = preg_match_all(
            "($beg_tag.*$close_tag)siU",
            (string) $this->html,
            $matching_data
        );

        if ($found === false || !isset($matching_data[0])) {
            return array();
        }

        return $matching_data[0];
    }
}
?>

<?php
// Page to scrape and the pair of tags whose contents should be listed.
$urlrun = "http://m4.cricbuzz.com/";
$stag = '<span>';
$etag = "</span>";

// Download the page, then collect every <span>…</span> fragment.
$tspider = new tagSpider();
$tspider->fetchPage($urlrun);
$linkarray = $tspider->parse_array($stag, $etag);

// Print each fragment with all markup except <br> and <div> removed,
// followed by a visual separator.
foreach ($linkarray as $result) {
    echo strip_tags($result, '<br><div>'), "<br>-<br>";
}
?> 

如何使用相同的程序显示 href?

1 个答案:

答案 0 :(得分:0)

<?php

    /**
     * Minimal page fetcher: downloads a URL with cURL, extracts the spans of
     * text found between a pair of tags, and collects link targets from
     * selected list items.
     */
    class tagSpider {

        /** cURL handle used while a request is in flight; null otherwise. */
        public $crl = null;

        /** Body of the last fetched page ("" when nothing was fetched or the fetch failed). */
        public $html = "";

        /**
         * Kept for backward compatibility only. It used to be passed to
         * CURLOPT_BINARYTRANSFER, which the PHP manual documents as having no
         * effect (raw output is always returned with CURLOPT_RETURNTRANSFER).
         */
        public $binary = 0;

        /** URL passed to the most recent fetchPage() call. */
        public $url = "";

        /**
         * Resets the object state.
         *
         * A method named after the class is no longer treated as a
         * constructor on PHP 8, so initialisation lives in __construct().
         */
        function __construct() {
            $this->crl = null;
            $this->html = "";
            $this->binary = 0;
            $this->url = "";
        }

        /**
         * Downloads $url (following redirects) and stores the body in $this->html.
         *
         * @param string $url Address of the page to fetch.
         * @return bool True when a body was received, false otherwise. On
         *              failure $this->html is left as an empty string.
         */
        function fetchPage($url) {
            $this->url = $url;
            $this->html = "";

            // Nothing to request: skip cURL entirely instead of issuing a
            // call that is guaranteed to fail.
            if ($url === null || $url === "") {
                return false;
            }

            $this->crl = curl_init();
            if ($this->crl === false) {
                $this->crl = null;
                return false;
            }

            curl_setopt($this->crl, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($this->crl, CURLOPT_URL, $this->url);
            curl_setopt($this->crl, CURLOPT_FOLLOWLOCATION, true);

            $body = curl_exec($this->crl);
            curl_close($this->crl);
            $this->crl = null;

            // curl_exec() returns false on failure; never store that in $html.
            if ($body === false) {
                return false;
            }

            $this->html = $body;
            return true;
        }

        /**
         * Returns every span of the fetched page that starts with $beg_tag
         * and ends with the nearest following $close_tag (both included).
         *
         * The match is case-insensitive, spans newlines and is ungreedy.
         * Both arguments are inserted into the pattern verbatim, so they are
         * interpreted as regular-expression fragments.
         *
         * @param string $beg_tag   Opening delimiter, e.g. '<span>'.
         * @param string $close_tag Closing delimiter, e.g. '</span>'.
         * @return array List of matched strings; empty when nothing matches
         *               or the pattern is invalid.
         */
        function parse_array($beg_tag, $close_tag) {
            $matching_data = array();
            $found = preg_match_all(
                "($beg_tag.*$close_tag)siU",
                (string) $this->html,
                $matching_data
            );

            if ($found === false || !isset($matching_data[0])) {
                return array();
            }

            return $matching_data[0];
        }

        /**
         * Collects the href attribute of every <a> nested in an <li> whose
         * class attribute equals $item_class exactly (character for
         * character, including whitespace).
         *
         * @param string $item_class Class attribute value an <li> must carry.
         *                           Defaults to the value used by the
         *                           original hard-coded check.
         * @return array List of href strings in document order; empty when
         *               no page is loaded or nothing matches.
         */
        function getLinks($item_class = 'ui-li ui-btn-icon-right ui-btn-up-d ui-odd-match-column ') {
            $href = array();

            // No usable markup (never fetched, or the fetch failed).
            if (!is_string($this->html) || trim($this->html) === '') {
                return $href;
            }

            // Collect libxml parse errors internally rather than muting them
            // with @, and restore the caller's setting afterwards.
            $previous = libxml_use_internal_errors(true);

            $dom = new DOMDocument();
            // Must be set before loading to have any effect.
            $dom->preserveWhiteSpace = false;
            $loaded = $dom->loadHTML($this->html);

            libxml_clear_errors();
            libxml_use_internal_errors($previous);

            if ($loaded === false) {
                return $href;
            }

            foreach ($dom->getElementsByTagName('li') as $item) {
                if ($item->getAttribute('class') !== $item_class) {
                    continue;
                }
                foreach ($item->getElementsByTagName('a') as $link) {
                    $href[] = $link->getAttribute('href');
                }
            }

            return $href;
        }

    }
?>

<?php

    // Page to scrape.
    $urlrun = "http://m4.cricbuzz.com/";

    // Left over from the tag-matching version of this script; they are not
    // passed to getLinks() below.
    $stag = 'span';
    $etag = "</span>";

    // Download the page and dump every link target that getLinks() finds.
    $tspider = new tagSpider();
    $tspider->fetchPage($urlrun);

    $linkarray = $tspider->getLinks();
    var_dump($linkarray);
?>