RSS解析器包含类别

时间:2013-09-18 15:34:15

标签: php arrays rss rss-reader

我最近继承了RSS / XML解析器,虽然看起来效果很好,但我发现有些东西丢失了。

例如,从博客中提取RSS源。它缺少项目中的所有类别。它显示每个项目只有一个类别,而实际上它应该显示为具有多个类别。

演示链接:http://dev.o7t.in/rss/

链接到实际Feed:http://o7thblog.com/feed/

您可以看到feed中的第一项如何在第一项中总共有8个类别。 (可能需要查看来源)

但是,在Demo中,您可以看到它只显示1个类别

这是我的全部代码:

<?php

class o7thRssFeedPuller{

    public $FeedUrl = ''; // URL of the feed to pull in
    public $ReturnJson = false; // Return the array as a JSON encoded string instead?
    public $MaxItems = 0; // 0 = unlimited (except by feed), only applicable to GetItems

    // Internal holders
    private $document;
    private $channel;
    private $items;

    // Get the full RSS feed
    public function GetRSS($includeAttributes = false) {
        // Pull in our feed
        $this->loadParser(file_get_contents($this->FeedUrl, false, $this->randomContext()));
        if($includeAttributes) {
            // only if we are including attributes
            return ($this->ReturnJson) ? json_encode($this->document) : $this->document;
        }
        // Return either an array or a json encoded string
        return ($this->ReturnJson) ? json_encode($this->valueReturner()) : $this->valueReturner();
    }

    // Get the channel data
    public function GetChannel($includeAttributes = false) {
        // Pull in our feed
        $this->loadParser(file_get_contents($this->FeedUrl, false, $this->randomContext()));
        if($includeAttributes) {
            // only if we are including attributes
            return ($this->ReturnJson) ? json_encode($this->channel) : $this->channel;
        }
        // Return either an array or a json encoded string
        return ($this->ReturnJson) ? json_encode($this->valueReturner($this->channel)) : $this->valueReturner($this->channel);
    }

    // Get the items
    public function GetItems($includeAttributes=false) {
        // Pull in our feed
        $this->loadParser(file_get_contents($this->FeedUrl, false, $this->randomContext()));
        if($includeAttributes) {
            // only if we are including attributes
            $arr = ($this->MaxItems == 0) ? $this->items : array_slice($this->items, 0, $this->MaxItems);
            return ($this->ReturnJson) ? json_encode($arr) : $arr;
        }
        // Return either an array or a json encoded string
        $arr = ($this->MaxItems == 0) ? $this->valueReturner($this->items) : array_slice($this->valueReturner($this->items), 0, $this->MaxItems);
        return ($this->ReturnJson) ? json_encode($arr) : $arr;
    }


    // -------------------------------------------------------------------------------------------------
    // Internal Methods

    private function loadParser($rss=false) {
        if($rss) {
            $this->document = array();
            $this->channel = array();
            $this->items = array();
            $DOMDocument = new DOMDocument;
            $DOMDocument->strictErrorChecking = false;
            $DOMDocument->loadXML($rss);
            $this->document = $this->extractDOM($DOMDocument->childNodes);
        }
    }

    private function valueReturner($valueBlock=false) {
        if(!$valueBlock) {
            $valueBlock = $this->document;
        }
        foreach($valueBlock as $valueName => $values) {
            if(isset($values['value'])) {
                $values = $values['value'];
            }
            if(is_array($values)) {
                $valueBlock[$valueName] = $this->valueReturner($values);
            } else {
                $valueBlock[$valueName] = $values;
            }
        }
        return $valueBlock;
    }

    private function extractDOM($nodeList,$parentNodeName=false) {
        $itemCounter = 0;
        foreach($nodeList as $values) {
            if(substr($values->nodeName,0,1) != '#') {
                if($values->nodeName == 'item') {
                    $nodeName = $values->nodeName.':'.$itemCounter;
                    $itemCounter++;
                } else {
                    $nodeName = $values->nodeName;
                }
                $tempNode[$nodeName] = array();             
                if($values->attributes) {
                    for($i=0;$values->attributes->item($i);$i++) {
                        $tempNode[$nodeName]['properties'][$values->attributes->item($i)->nodeName] = $values->attributes->item($i)->nodeValue;
                    }
                }
                if(!$values->firstChild) {
                    $tempNode[$nodeName]['value'] = $values->textContent;
                } else {
                    $tempNode[$nodeName]['value']  = $this->extractDOM($values->childNodes, $values->nodeName);
                }
                if(in_array($parentNodeName, array('channel','rdf:RDF'))) {
                    if($values->nodeName == 'item') {
                        $this->items[] = $tempNode[$nodeName]['value'];
                    } elseif(!in_array($values->nodeName, array('rss','channel'))) {
                        $this->channel[$values->nodeName] = $tempNode[$nodeName];
                    }
                }
            } elseif(substr($values->nodeName,1) == 'text') {
                $tempValue = trim(preg_replace('/\s\s+/',' ',str_replace("\n",' ', $values->textContent)));
                if($tempValue) {
                    $tempNode = $tempValue;
                }
            } elseif(substr($values->nodeName,1) == 'cdata-section'){
                $tempNode = $values->textContent;
            }
        }
        return (!isset($tempNode)) ? null : $tempNode;
    }

    // Load in a random header to pass
    private function randomContext() {
        $headerstrings = array();
        $headerstrings['User-Agent'] = 'Mozilla/5.0 (Windows; U; Windows NT 5.'.rand(0,2).'; en-US; rv:1.'.rand(2,9).'.'.rand(0,4).'.'.rand(1,9).') Gecko/2007'.rand(10,12).rand(10,30).' Firefox/2.0.'.rand(0,1).'.'.rand(1,9);
        $headerstrings['Accept-Charset'] = rand(0,1) ? 'en-gb,en;q=0.'.rand(3,8) : 'en-us,en;q=0.'.rand(3,8);
        $headerstrings['Accept-Language'] = 'en-us,en;q=0.'.rand(4,6);
        $setHeaders =   'Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'."\r\n".
                        'Accept-Charset: '.$headerstrings['Accept-Charset']."\r\n".
                        'Accept-Language: '.$headerstrings['Accept-Language']."\r\n".
                        'User-Agent: '.$headerstrings['User-Agent']."\r\n";
        $contextOptions = array(
            'http'=>array(
                'method'=>"GET",
                'header'=>$setHeaders
            )
        );
        return stream_context_create($contextOptions);
    }

}

?>

对于演示页面:

<?php

    require_once($_SERVER['DOCUMENT_ROOT'] . '/rss/o7th.rss.feed.puller.php');

    $fp = new o7thRssFeedPuller();

    $fp->FeedUrl = 'http://o7thblog.com/feed';
    $fp->MaxItems = 2;

    echo '<table width="100%" cellpadding="0" cellspacing="0">';
    echo '  <tr>';

    echo '      <td>';
    echo '          <textarea cols="120" rows="30">';
    print_r($fp->GetItems());
    echo '          </textarea>';
    echo '      </td>';

    echo '  </tr>';
    echo '</table>';
?>

所以,我认为这个问题存在于valueReturner方法或extractDOM方法的某个地方,但我不确定在哪里,也不知道如何才能获得所有类别返回数组。

你能帮忙吗?

2 个答案:

答案 0 :(得分:2)

我建议使用SimpleXML来解析Feed。

以下是如何做到这一点:

$feed_url = 'http://o7thblog.com/feed/';
$feed = simplexml_load_file($feed_url, null, LIBXML_NOCDATA);
$channel = $feed->channel;
echo "<h1><a href=\"{$channel->link}\">{$channel->title}</a></h1>\n";
echo "{$channel->description}\n";
echo "<dl>\n";
foreach ($channel->item as $item) {
    echo "<dt><a href=\"{$item->link}\">{$item->title}</a></dt>\n"
    . "<dd style=\"margin-bottom: 30px;\"><div style=\"font-size: small;\">{$item->pubDate}</div>\n"
    . "<div>{$item->description}</div>\n"
    . "Categories: <strong>".implode('</strong>, <strong>', (array) $item->category) . "</strong>\n</dd>";
}
echo "</dl>\n";

上面显示了所有类别。

答案 1 :(得分:0)

您已编写了一个自定义解析器,只需使用一行代码即可完成!

$feed = (array) simplexml_load_file('http://o7thblog.com/feed/', null, LIBXML_NOCDATA);