如何获取title元素的文本值?用DOM Element到底能不能做到?还是我必须手工解析文本?
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
import re
import urllib.request
from xml.dom import minidom
def download(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the charset announced in the response's
    Content-Type header, falling back to UTF-8 when none is given or the
    announced charset is unknown to Python.

    Args:
        url: Any URL accepted by ``urllib.request.urlopen``.

    Returns:
        The decoded response body as ``str``.
    """
    with urllib.request.urlopen(url) as res:
        raw = res.read()
        # A hard-coded 'latin-1' turns every non-ASCII character of a UTF-8
        # feed into mojibake, so honour what the server declares instead.
        charset = res.headers.get_content_charset() or 'utf-8'
    try:
        # errors='replace' keeps decoding from raising on stray bytes, just
        # as the previous latin-1 decode could never raise.
        return raw.decode(charset, errors='replace')
    except LookupError:
        # The server announced a charset Python does not know.
        return raw.decode('utf-8', errors='replace')
class RSSFeed(object):
    """A feed downloaded from a URL and parsed into a minidom document.

    Attributes are set in ``__init__``:
        url: the URL the feed was fetched from.
        raw_xml: the text returned by ``download(url)``.
        dom: the minidom document parsed from ``raw_xml``.
        links: all ``<link>`` elements found anywhere in the document.
    """

    def __init__(self, url):
        """Download *url* and parse the result with minidom."""
        self.url = url
        self.raw_xml = download(url)
        self.dom = minidom.parseString(self.raw_xml)
        self.links = self.dom.getElementsByTagName('link')

    def entries(self):
        """Print the pretty-printed ``<title>`` element of every ``<entry>``.

        This prints the whole element (markup included), not its text
        value, and returns ``None``.

        Raises:
            IndexError: if an ``<entry>`` contains no ``<title>`` element.
        """
        for element in self.dom.getElementsByTagName('entry'):
            title = element.getElementsByTagName('title')[0]
            print(title.toprettyxml())

    def __str__(self):
        """Return the whole document as pretty-printed XML."""
        return self.dom.toprettyxml()
feed_url = 'https://rickys-python-notes.blogspot.com/atom.xml?redirect=false&start-index=1&max-results=500'
feed = RSSFeed(feed_url)
dom = feed.dom
# entries() is called on the feed object; whatever it returns is printed.
print(feed.entries())
答案 0 :(得分:2)
确定任何XML元素的节点值(即文本内容)的规范方法是:先找出该元素的所有后代文本节点,再把它们的值连接起来。
minidom莫名其妙地没有实现这一过程,所以如果必须使用minidom,就需要自己动手实现。
所以我们需要一些辅助函数。
让我们在模块中收集它们。
# minidom_helpers.py
def get_descendant_nodes(context_node, predicate):
    """Yield every descendant of *context_node* that *predicate* accepts.

    Nodes are produced depth-first in document order: a child is tested
    (and possibly yielded) before its own descendants are visited.

    Args:
        context_node: A minidom node, or ``None``.
        predicate: Callable taking a node and returning a truthy value for
            nodes that should be yielded.

    Yields:
        The matching descendant nodes. Nothing is yielded when
        *context_node* is ``None``.
    """
    # Stop cleanly on a missing node: yielding None and then touching
    # None.childNodes would raise AttributeError in the consumer.
    if context_node is None:
        return
    for child in context_node.childNodes:
        if predicate(child):
            yield child
        # Always descend, even into children the predicate rejected.
        yield from get_descendant_nodes(child, predicate)
def get_text_value(context_node, default=None):
    """Return the text content of *context_node*.

    The values of all descendant text nodes are stripped of surrounding
    whitespace and joined with single spaces.

    Args:
        context_node: A minidom node, or ``None``.
        default: Value returned when there is no node or no text.

    Returns:
        The joined text, or *default* when *context_node* is ``None`` or
        contains only whitespace.
    """
    # get_first_child() returns None for a missing element; treat that as
    # "no text" instead of crashing.
    if context_node is None:
        return default
    text_nodes = get_descendant_nodes(context_node, lambda n: n.nodeType == n.TEXT_NODE)
    stripped = (node.nodeValue.strip() for node in text_nodes)
    # Drop whitespace-only nodes (indentation between child elements) so
    # they do not add stray spaces to the result.
    text_value = ' '.join(chunk for chunk in stripped if chunk)
    return text_value if text_value else default
def get_first_child(context_node, element_name):
    """Return the first element named *element_name* below *context_node*.

    Despite the name, the lookup uses ``getElementsByTagName``, which
    searches all descendants rather than direct children only.

    Args:
        context_node: A minidom document or element, or ``None``.
        element_name: Tag name to look for.

    Returns:
        The first matching element, or ``None`` when there is no match or
        *context_node* itself is ``None``.
    """
    # Accept None so lookups can be chained without intermediate checks.
    if context_node is None:
        return None
    elems = context_node.getElementsByTagName(element_name)
    return elems[0] if elems else None
现在我们可以做到
import re
import urllib.request
from xml.dom import minidom
from minidom_helpers import *
class RSSFeed(object):
    """A feed fetched from a URL and parsed into a minidom document.

    Attributes are set in ``__init__``:
        url: the URL the feed was fetched from.
        dom: the minidom document parsed from the response.
        links: all ``<link>`` elements found anywhere in the document.
    """

    def __init__(self, url):
        """Fetch *url* and let the XML parser read the raw response bytes."""
        self.url = url
        # Close the response once parsing is done instead of leaving it to
        # the garbage collector.
        with urllib.request.urlopen(url) as response:
            self.dom = minidom.parse(response)
        self.links = self.dom.getElementsByTagName('link')

    def entries(self):
        """Yield one dict per ``<entry>`` element.

        Each dict has a single key, ``"title"``, holding the result of
        ``get_text_value`` for the entry's first ``<title>`` element.
        """
        for entry in self.dom.getElementsByTagName('entry'):
            yield {
                "title": get_text_value(get_first_child(entry, 'title'))
            }

    def __str__(self):
        """Return the whole document as pretty-printed XML."""
        return self.dom.toprettyxml()
feed_url = 'https://rickys-python-notes.blogspot.com/atom.xml?redirect=false&start-index=1&max-results=500'
feed = RSSFeed(feed_url)
# Print every item that entries() produces.
for entry in feed.entries():
    print(entry)
解析XML的一般说明。尝试养成将XML视为二进制数据而不是文本的习惯。
XML解析器实现了一套自动检测文件编码的复杂机制。提前把文件或HTTP响应解码成字符串来绕过这套机制,既没有必要,也不明智:
# BAD CODE, DO NOT USE
def download(url):
    """Anti-pattern example: fetch *url* and decode the body as latin-1.

    Kept deliberately wrong for illustration. The encoding is hard-coded,
    so a body in any other encoding (for example UTF-8) comes back with
    its non-ASCII characters garbled.
    """
    with urllib.request.urlopen(url) as res:
        # Hard-coded encoding: this is the mistake being demonstrated.
        return res.read().decode('latin-1')
# Still part of the anti-pattern: the parser is handed already-decoded text.
raw_xml = download(url)
# There is no `self` at this level; parse the local variable assigned above.
dom = minidom.parseString(raw_xml)
以上代码对文件编码做了硬编码的(而且在您的情况下是错误的)假设,一旦服务器出于某种原因开始以UTF-16发送文件,它就会出错。
如果您将XML视为二进制数据而不是文本,那么它会变得更容易,也更加健壮。
# Pass the raw byte stream to the parser and close the response afterwards.
with urllib.request.urlopen(url) as res:
    dom = minidom.parse(res)
XML解析器将嗅探字节并确定它们所处的编码。
从文件中读取XML也是如此。而不是
# BAD CODE, DO NOT USE
# Anti-pattern: text mode with a hard-coded encoding decodes the file before
# the XML parser ever sees it.
with open(path, 'r', encoding='latin-1') as fp:
    dom = minidom.parseString(fp.read())
使用
# Binary mode: the parser receives the raw bytes of the file.
with open(path, 'rb') as fp:
    dom = minidom.parse(fp)
或只是
# The path is passed to minidom.parse directly, with no explicit open().
dom = minidom.parse(path)
答案 1 :(得分:0)
def entries(self):
    """Yield one ``Entry`` object per ``<entry>`` element of ``self.dom``.

    Every yielded object is a freshly created class named ``Entry`` whose
    class attributes carry the data:
        title: text of the first ``<title>`` element.
        link: ``href`` attribute of the first ``<link>`` element.
        author: text of the first ``<name>`` element.
        article: first child *node* of the first ``<content>`` element.

    A field is ``None`` when its element is missing or empty, rather than
    raising IndexError or AttributeError.
    """
    def first(parent, tag):
        # First descendant element with this tag name, or None.
        found = parent.getElementsByTagName(tag)
        return found[0] if found else None

    def text_of(node):
        # Value of the node's first child, or None for a missing/empty node.
        child = node.firstChild if node is not None else None
        return child.nodeValue if child is not None else None

    for element in self.dom.getElementsByTagName('entry'):
        link_node = first(element, 'link')
        content_node = first(element, 'content')
        title = text_of(first(element, 'title'))
        link = link_node.getAttribute('href') if link_node is not None else None
        author = text_of(first(element, 'name'))
        # NOTE(review): article is the child node itself, not its nodeValue;
        # kept as in the original - confirm this is intended.
        article = content_node.firstChild if content_node is not None else None
        yield type('Entry', (object,), dict(title=title, link=link, author=author, article=article))
答案 2 :(得分:0)
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
import urllib.request
from xml.dom import minidom
def parse_feed(url):
    """Fetch the feed at *url* and yield one ``Entry`` object per ``<entry>``.

    Every yielded object is a freshly created class named ``Entry`` whose
    class attributes carry the data:
        title: text of the first ``<title>`` element.
        link: ``href`` attribute of the first ``<link>`` element.
        author: text of the first ``<name>`` element.
        article: text of the first ``<content>`` element.

    A field is ``None`` when its element is missing or empty, rather than
    raising IndexError or AttributeError.

    Args:
        url: Any URL accepted by ``urllib.request.urlopen``.
    """
    with urllib.request.urlopen(url) as res:
        # Give the parser the raw bytes so it determines the encoding
        # itself; decoding as latin-1 first garbles non-ASCII text in a
        # UTF-8 feed.
        dom = minidom.parse(res)

    def first(parent, tag):
        # First descendant element with this tag name, or None.
        found = parent.getElementsByTagName(tag)
        return found[0] if found else None

    def text_of(node):
        # Value of the node's first child, or None for a missing/empty node.
        child = node.firstChild if node is not None else None
        return child.nodeValue if child is not None else None

    for element in dom.getElementsByTagName('entry'):
        link_node = first(element, 'link')
        title = text_of(first(element, 'title'))
        link = link_node.getAttribute('href') if link_node is not None else None
        author = text_of(first(element, 'name'))
        article = text_of(first(element, 'content'))
        yield type('Entry', (object,), dict(title=title, link=link, author=author, article=article))
feed_url = 'https://rickys-python-notes.blogspot.com/atom.xml?redirect=false&start-index=1&max-results=500'
# Print the title and link of every entry in the feed.
for entry in parse_feed(feed_url):
    print(entry.title, entry.link)