使用ElementTree Python获取XML attrib

时间:2013-12-11 19:32:34

标签: xml python-3.x xml-parsing elementtree

我正在尝试获取一个元素属性,但我得到的只是一个无值或空列表,具体取决于我尝试获取它。此外,如果有人知道更好的方法来获得元素的特定标签,我将不胜感激。 这是代码,间隔开的部分应该返回url但不是。

import xml.etree.ElementTree as ET
import webbrowser,time,urllib.request
import tkinter as tk
import urllib

# webbrowser.get('windows-default').open_new('http://www.reddit.com/'+'r/blender')
main = tk.Tk()
class Application(tk.Frame):



    def __init__(self, master=None):
        tk.Frame.__init__(self, master)
        self.pack()
        self.createWidgets()
        self.initial()

    def createWidgets(self):
        # print('Went to createWidgets()')
        self.send_entry = tk.Entry(self)
        self.send_entry.grid(row=0,column=0)
        self.change_sub = tk.Button(self,text='Change Subreddit', command=lambda :self.getXML(self.send_entry.get())).grid(row=0 , column=2)
        self.lb_scrollY = tk.Scrollbar(self,orient=tk.VERTICAL)
        self.lb_scrollY.grid(row=1,column=1,sticky=tk.NS)
        self.thread_lb = tk.Listbox(self,yscrollcommand=self.lb_scrollY.set)
        self.lb_scrollY['command']=self.thread_lb.yview
        self.thread_lb.grid(row=1,column=0)
        self.QUIT = tk.Button(self, text="QUIT", fg="red", command=main.destroy).grid(row=2)




    def descStripper(self,desc):
        x1=int(desc.find('alt="'))
        if x1 != -1:
            x2Start = x1+5
            x2=int(desc.find('"',x2Start))
            desc = desc[x1+5:x2]
            return desc
        else:
            desc = "There is no description. Maybe it's a link"
            return desc

    def lbPopulator(self,title,pub,link):
        # print('Went to lbPopulator()')
        self.thread_lb.delete(0,tk.END)
        for item in title:
            self.thread_lb.insert(tk.END,item)

    def getXmlData(self):
        counter = 0
        self.threadPubDateList = []
        self.threadTitleList = []
        self.threadLinkList = []
        self.threadDescList = []
        self.threadThumbNail = []
        tree=ET.parse('rss.xml')
        root=tree.getroot()
        for channel in root:
            for SubChannel in channel:
                if SubChannel.tag == 'item':
                    for threadInfo in SubChannel:
                        # print(SubChannel.getchildren())
                        if threadInfo.tag == 'title':
                            self.threadTitleList.append(threadInfo.text)
                        if threadInfo.tag == 'pubDate':
                            self.threadPubDateList.append(threadInfo.text[:-6])
                        if threadInfo.tag == 'link':
                            self.threadLinkList.append(threadInfo.text)
                        if threadInfo.tag == 'description':
                            self.threadDescList.append(self.descStripper(threadInfo.text))









                        if threadInfo.tag == '{http://search.yahoo.com/mrss/}title':
                            print(threadInfo.tag)
                            print(threadInfo.attrib)
                            print(threadInfo.get('url'))











        self.lbPopulator(self.threadTitleList,self.threadPubDateList,self.threadLinkList)
        # print(self.threadTitleList)
        # print(self.threadPubDateList)
        # print(self.threadLinkList)
        # print(self.threadDescList)
    def getXML(self,subreddit):
        try:
            url = 'http://www.reddit.com'+subreddit+'.rss'
            source = urllib.request.urlretrieve(url,'rss.xml')
            self.getXmlData()
        except urllib.error.HTTPError as err:
            print('Too many requests-Try again')
    def initial(self):
        try:
            source = urllib.request.urlretrieve('http://www.reddit.com/.rss','rss.xml')
            self.getXmlData()
        except urllib.error.HTTPError as err:
            print('Too many requests-Trying again 3')
            time.sleep(3)
            self.__init__()


# main.geometry("250x150")

app = Application(master=main)
app.mainloop()

这是传递XML文件时应该返回缩略图url的代码。这是所有最后的'if'陈述,其余的都可以正常工作。

def getXmlData(self):
    counter = 0
    self.threadPubDateList = []
    self.threadTitleList = []
    self.threadLinkList = []
    self.threadDescList = []
    self.threadThumbNail = []
    tree=ET.parse('rss.xml')
    root=tree.getroot()
    for channel in root:
        for SubChannel in channel:
            if SubChannel.tag == 'item':
                for threadInfo in SubChannel:
                    # print(SubChannel.getchildren())
                    if threadInfo.tag == 'title':
                        self.threadTitleList.append(threadInfo.text)
                    if threadInfo.tag == 'pubDate':
                        self.threadPubDateList.append(threadInfo.text[:-6])
                    if threadInfo.tag == 'link':
                        self.threadLinkList.append(threadInfo.text)
                    if threadInfo.tag == 'description':
                        self.threadDescList.append(self.descStripper(threadInfo.text))
                    if threadInfo.tag == '{http://search.yahoo.com/mrss/}title':
                        print(threadInfo.tag)
                        print(threadInfo.attrib)
                        print(threadInfo.get('url'))

1 个答案:

答案 0 :(得分:1)

唯一具有名为 url 属性的标记是 media:thumbnail 标记。正如您所指出的,mediaxmlns:media="http://search.yahoo.com/mrss/"的顶部被声明。这让我相信你的最后一个if语句应该是:

if threadInfo.tag == '{http://search.yahoo.com/mrss/}thumbnail':
   print(threadInfo.tag)
   print(threadInfo.attrib)
   print(threadInfo.get('url'))

哪个应产生输出:

'{http://search.yahoo.com/mrss/}thumbnail'
{'url' : 'http://a.thumbs.redditmedia.com/cozEqqG9muj-tT3Z.jpg'}
'http://a.thumbs.redditmedia.com/cozEqqG9muj-tT3Z.jpg'