我正在尝试从 RSS 文档中提取数据,并使用下面的代码来解析 XML 文档。
但这段代码对此文件不起作用:http://www.mediafire.com/?hptptj8847awnn1 。请帮忙!
#import easy to use xml parser called minidom:
import xml.dom.minidom as minidom
import csv
def getTags(xml):
    """Collect the title text of every <item> element in an RSS document.

    The number of titles found is printed, then the titles are returned.

    Args:
        xml: A filename or an open file object, handed to ``minidom.parse``.

    Returns:
        A list with one string per <item> that contains a <title>, in
        document order. The string is the concatenation of the title's text
        and CDATA children (empty when the title has none).
    """
    doc = minidom.parse(xml)
    titles = []
    for item in doc.getElementsByTagName("item"):
        title_elements = item.getElementsByTagName("title")
        if not title_elements:
            # Skip items that carry no <title> instead of raising IndexError.
            continue
        # A title may be split across several text / CDATA nodes; join them
        # so no part of the title is lost.
        titles.append("".join(
            node.data
            for node in title_elements[0].childNodes
            if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE)
        ))
    # The parenthesised form prints the same value on Python 2 and Python 3.
    print(len(titles))
    return titles
# Script entry point: extract the titles from the sample feed file when this
# module is run directly (nothing happens on import).
if __name__ == "__main__":
    document = 'D2B0918.xml'
    getTags(document)
答案 0 :(得分:0)
如果你只是想解析 RSS,我推荐优秀的 feedparser
库,它不仅能满足你的需求,而且功能远不止于此。
答案 1 :(得分:0)
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
import urllib.request
from xml.dom import minidom
def _first_match(parent, tag, prefer_children=False):
    """Return the first <tag> element under ``parent``, or None if absent.

    With ``prefer_children`` a direct child of ``parent`` wins over a deeper
    descendant that merely appears earlier in the document.
    """
    matches = parent.getElementsByTagName(tag)
    if prefer_children:
        for match in matches:
            if match.parentNode is parent:
                return match
    return matches[0] if matches else None


def _first_text(parent, tag, prefer_children=False):
    """Return the value of the first <tag>'s first child node, or None."""
    match = _first_match(parent, tag, prefer_children)
    if match is None or match.firstChild is None:
        return None
    return match.firstChild.nodeValue


def _first_href(parent, prefer_children=False):
    """Return the ``href`` attribute of the first <link>, or None if absent."""
    match = _first_match(parent, 'link', prefer_children)
    return None if match is None else match.getAttribute('href')


def parse_feed(url):
    """Download an Atom feed and expose its fields as plain attributes.

    Args:
        url: Address of the feed; anything ``urllib.request.urlopen`` accepts.

    Returns:
        A dynamically created ``Feed`` class used as an attribute container
        with ``title``, ``link``, ``published``, ``updated`` and ``entries``.
        Each item of ``entries`` is an ``Entry`` class (a subclass of the
        returned feed) carrying ``title``, ``link``, ``author``,
        ``published``, ``updated``, ``article``, ``tags`` and ``_id``.
        Fields whose element is missing or empty are None; ``tags`` is a
        list of the ``term`` attributes of the entry's <category> elements.
    """
    # This is what parse_feed returns.
    feed = type('Feed', (object,), {})
    feed.entries = []
    with urllib.request.urlopen(url) as res:
        # Pass the raw bytes so the parser honours the encoding declared in
        # the XML prolog; decoding as latin-1 first garbles UTF-8 feeds.
        dom = minidom.parseString(res.read())

    # Prefer the feed element's own children so that a value belonging to
    # the first entry is only used when the feed itself does not carry one.
    root = dom.documentElement
    feed.title = _first_text(root, 'title', prefer_children=True)
    feed.link = _first_href(root, prefer_children=True)
    feed.published = _first_text(root, 'published', prefer_children=True)
    feed.updated = _first_text(root, 'updated', prefer_children=True)

    for element in dom.getElementsByTagName('entry'):
        entry_dict = dict(
            title=_first_text(element, 'title'),
            link=_first_href(element),
            author=_first_text(element, 'name'),
            # Stored on the entry itself; otherwise attribute lookup would
            # fall through to the feed-level values via class inheritance.
            published=_first_text(element, 'published'),
            updated=_first_text(element, 'updated'),
            article=_first_text(element, 'content'),
            tags=[
                node.getAttribute('term')
                for node in element.getElementsByTagName('category')
            ],
            _id=_first_text(element, 'id'),
        )
        feed.entries.append(type('Entry', (feed,), entry_dict))
    return feed
# Example use: fetch a Blogger Atom feed and print its headline fields,
# followed by the title and link of every entry.
feed_url = 'https://rickys-python-notes.blogspot.com/atom.xml?redirect=false&start-index=1&max-results=1000'
feed = parse_feed(feed_url)
print(feed.title)
# Fall back to ``published`` when the feed object carries no ``updated``
# attribute, rather than raising AttributeError.
print(getattr(feed, 'updated', feed.published))
for entry in feed.entries:
    print(entry.title)
    print(entry.link)