在 Python 中解析 RSS XML 文件

时间:2011-11-08 04:43:36

标签: python xml parsing

我正在尝试从 RSS 文档中提取数据,并使用下面的代码来解析这个 XML 文档。

但这段代码无法处理这个文件:http://www.mediafire.com/?hptptj8847awnn1 。请帮忙!

#import easy to use xml parser called minidom:
import xml.dom.minidom as minidom
import csv

def getTags(xml):
    """Print and return the title of every <item> found in an XML document.

    The number of titles is printed first, followed by one title per line.

    Args:
        xml: A filename or an open file object; it is passed unchanged to
            ``minidom.parse``.

    Returns:
        A list of title strings, one for each <item> that contains a
        <title> element, in document order. Items without a <title> are
        skipped.
    """
    doc = minidom.parse(xml)

    titles = []
    for item in doc.getElementsByTagName("item"):
        title_elements = item.getElementsByTagName("title")
        if not title_elements:
            # Indexing [0] on an empty result would raise IndexError.
            continue

        # A title can be plain text, a CDATA section, or a mix of both
        # (e.g. CDATA surrounded by whitespace), so collect every piece.
        pieces = [
            node.data
            for node in title_elements[0].childNodes
            if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE)
        ]
        titles.append("".join(pieces).strip())

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(len(titles))
    for title in titles:
        print(title)

    return titles

if __name__ == "__main__":
    # When run as a script, process the hard-coded sample file.
    getTags('D2B0918.xml')

2 个答案:

答案 0 :(得分:0)

如果你只是想解析 RSS,我推荐优秀的 feedparser 库,它很可能能满足你的需求,而且功能远不止于此。

http://code.google.com/p/feedparser/

答案 1 :(得分:0)

#-*-coding:utf8;-*-
#qpy:3
#qpy:console

import urllib.request
from xml.dom import minidom


def _find_first(parent, tag):
    """Return the first <tag> element under parent, or None if absent.

    Direct children are preferred so that a feed-level lookup is not
    satisfied by an element that belongs to an entry; if no direct child
    matches, the first matching descendant is used instead.
    """
    for child in parent.childNodes:
        if child.nodeType == child.ELEMENT_NODE and child.tagName == tag:
            return child
    matches = parent.getElementsByTagName(tag)
    return matches[0] if matches else None


def _first_text(parent, tag):
    """Return the first child node's value of the first <tag>, else None."""
    element = _find_first(parent, tag)
    if element is None or element.firstChild is None:
        return None
    return element.firstChild.nodeValue


def _first_href(parent):
    """Return the href attribute of the first <link> under parent, else None."""
    link = _find_first(parent, 'link')
    return link.getAttribute('href') if link is not None else None


def parse_feed(url):
    """Download the feed at *url* and expose its fields as attributes.

    Args:
        url: Any URL accepted by ``urllib.request.urlopen``.

    Returns:
        A ``Feed`` class object with the attributes ``title``, ``link``,
        ``published``, ``updated`` and ``entries``. Each item of
        ``entries`` is an ``Entry`` class object with ``title``, ``link``,
        ``author``, ``published``, ``updated``, ``article``, ``tags`` and
        ``_id``. A field whose element is missing or empty is ``None``;
        ``tags`` is an empty list when there are no <category> elements.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        xml.parsers.expat.ExpatError: If the response is not well-formed XML.
    """
    with urllib.request.urlopen(url) as res:
        # Hand the raw bytes to the parser so it applies the encoding the
        # document itself declares; decoding as latin-1 first garbles any
        # non-ASCII text in a UTF-8 feed.
        dom = minidom.parseString(res.read())

    root = dom.documentElement

    # This is what parse_feed returns.
    feed = type('Feed', (object,), {})
    feed.entries = []
    feed.title = _first_text(root, 'title')
    feed.link = _first_href(root)
    feed.published = _first_text(root, 'published')
    feed.updated = _first_text(root, 'updated')

    for element in dom.getElementsByTagName('entry'):
        tags = [
            node.getAttribute('term')
            for node in element.getElementsByTagName('category')
        ]

        entry_dict = dict(
            title=_first_text(element, 'title'),
            link=_first_href(element),
            # <name> is nested inside <author>, so this relies on the
            # descendant fallback of the lookup helper.
            author=_first_text(element, 'name'),
            published=_first_text(element, 'published'),
            updated=_first_text(element, 'updated'),
            article=_first_text(element, 'content'),
            tags=tags,
            _id=_first_text(element, 'id'),
        )

        # Entries keep the feed class as their base, as before, so existing
        # subclass relationships are unchanged.
        feed.entries.append(type('Entry', (feed,), entry_dict))

    return feed


# Example use: fetch a Blogger Atom feed and list its entries.
feed_url = 'https://rickys-python-notes.blogspot.com/atom.xml?redirect=false&start-index=1&max-results=1000'
feed = parse_feed(feed_url)
print(feed.title)
# Use `published`: it is the feed-level timestamp attribute that parse_feed
# assigns on the returned feed object.
print(feed.published)
for entry in feed.entries:
    print(entry.title)
    print(entry.link)