Python RSS阅读器模块添加

时间:2018-06-20 15:18:45

标签: python rss-reader

我正在尝试从 XML 格式的 RSS feed 中读取更多的模块(标签)。利用现有的 Python 脚本,我已经能够通过模块标签获取一系列变量。根据我的研究,有效的模块还包括作者(author)标签和评论(comments)标签。但是,当我尝试修改脚本以包含这些标签时,脚本就会失败:每次在命令行中运行修改后的脚本,都会出现缩进错误(IndentationError)。

这是成功的代码:

import tempfile, os, hashlib, re, sys
from datetime import datetime
import email.utils
import scalatools as st
import scalalib as sl
from HTMLParser import HTMLParser
# Shared helpers and module-level state used by the functions below.
# parser: used by cleanup_text() to unescape text after tags are stripped.
parser = HTMLParser()
# tempf: system temp directory; feed folders and cached files live under it.
tempf = tempfile.gettempdir()
# svars: shared-variables object from scalalib; read_data() sets
# channel_title on it when scala5 is truthy.
svars = sl.sharedvars()

# --- configuration ---
filename = 'feed.xml' # local file name passed to grab_url/find_file for the feed
feedname = '' # feed folder name under tempf (set by the __main__ block)
lengthmax = 480 # character limit
totalmax = 0 # limit to n items (0 = max)
feedrefreshtime = 10 # minutes to check feed for updates
itemtimerange = 0 # minutes to filter old items (0 = don't filter)
username = '' # HTTP authentication user (if required)
userpassword = '' # HTTP authentication password (if required)
# --- per-item result lists, appended to by read_data() ---
itemimgurl = []
itemimgfile = []
itemtitle = []
itempubdate = []
itemdesc = []
itemqrfile = []
# NOTE(review): itemauthor is declared but nothing in the visible code appends to it.
itemauthor = []
# NOTE(review): read_data() assigns its own local named cachedfiles, so this
# module-level list appears to stay empty - confirm whether it is still needed.
cachedfiles = []

# PyQRNative is a third-party dependency. Presumably it provides QRCode and
# QRErrorCorrectLevel used by create_qr(): they are not defined anywhere else
# in this file and this is the only star import.
try: # 3rd party module
    from PyQRNative import *
except ImportError:
    # Fall back to installing the package at import time through the pip
    # helper exposed by scalatools, then retry the import once.
    # NOTE(review): if the install fails, the second import raises ImportError
    # and the module does not load.
    from scalatools import pip  # attempt dynamic install
    pip('install', 'PyQRNative')
    from PyQRNative import *

def cleanup_text(txt):
    """Return txt as a single clean line of plain text.

    Strips leading/trailing whitespace, removes anything that looks like a
    markup tag, unescapes the remainder with the module-level parser, turns
    CR, LF and tab characters into spaces and collapses runs of spaces.
    Falsy input yields ''.
    """
    if not txt:
        return ''
    cleaned = re.sub('<[^>]+>', '', txt.strip())  # drop <tags>
    cleaned = parser.unescape(cleaned)  # decode what is left after tag removal
    for control_char in ("\r", "\n", "\t"):
        cleaned = cleaned.replace(control_char, " ")
    return re.sub(' +', ' ', cleaned)  # collapse repeated spaces

def cleanup_description(txt):
    """Return a cleaned description shortened to the configured limit.

    The text is passed through cleanup_text() and then cut at a word
    boundary by smart_truncate() using the module-level lengthmax.
    Falsy input yields '' (the same contract as cleanup_text and
    cleanup_filename) instead of an implicit None.
    """
    if not txt:
        return ''
    return smart_truncate(cleanup_text(txt), lengthmax)

def cleanup_filename(txt):
    """Turn txt into a string that is safe to use as a file name.

    The text is cleaned with cleanup_text(), spaces become underscores and
    the characters in the pattern below (path separators, wildcards, dots,
    quotes, angle brackets, pipe, equals) are removed. Falsy input yields ''.
    """
    if not txt:
        return ''
    underscored = cleanup_text(txt).replace(' ', '_')
    # remove non Windows-friendly chars
    return re.sub('[\\\/:*?. \x22 <>|=]', '', underscored)

def smart_truncate(txt, length=160, suffix='...'):
    """Shorten txt to at most length characters, cutting at a word boundary.

    txt    -- text to shorten; falsy input yields ''
    length -- maximum length of the result, suffix included
    suffix -- marker appended when the text had to be cut

    Text that already fits is returned unchanged. Otherwise the text is cut
    at the last space that keeps the result within length. If there is no
    space to cut at (one very long word), the text is cut mid-word instead
    of being dropped entirely. When length is shorter than the suffix, the
    bare suffix is returned.
    """
    if not txt:
        return ''
    if len(txt) <= length:
        return txt
    # characters available for text once the suffix is accounted for
    room = max(length - len(suffix), 0)
    # look one character past the room so that a word ending exactly at the
    # limit is kept whole
    window = txt[:room + 1]
    head, sep, _ = window.rpartition(' ')
    if not sep:
        # no word boundary inside the window: fall back to a hard cut
        head = window[:room]
    return head + suffix

def convert_pubdate(datestr):
    """Convert an RFC-822 date string to a naive local datetime.

    Returns None when datestr is falsy, cannot be parsed, or describes a
    moment that cannot be represented as a timestamp.
    """
    if not datestr:
        return None
    parsedt = email.utils.parsedate_tz(datestr)  # convert to tuple
    if parsedt is None:
        # parsedate_tz signals an unparsable string with None; passing that
        # on to mktime_tz would raise TypeError
        return None
    try:
        return datetime.fromtimestamp(email.utils.mktime_tz(parsedt))  # convert to datetime
    except (ValueError, OverflowError, OSError):
        # date fields out of the range the platform can represent
        return None

def time_ago(datestr):
    """Describe how long ago datestr was, in user-friendly text.

    datestr is converted with convert_pubdate(); when that yields nothing,
    '' is returned. Ages under one week are phrased relatively ('right now',
    '5 minutes ago', 'a day ago', ...). Older dates, and dates in the
    future, are formatted with strftime('%x').
    """
    minute = 60
    hour = minute * 60
    day = hour * 24
    week = day * 7
    dateval = convert_pubdate(datestr)
    if not dateval:
        return ''
    sincedate = datetime.now() - dateval
    sincesecs = sincedate.days * day + sincedate.seconds
    if sincesecs < 0 or sincesecs >= week:
        # future dates and anything a week or older: show the plain date
        return dateval.strftime('%x')
    # from here on 0 <= sincesecs < week; an age of exactly 0 seconds counts
    # as 'right now' rather than falling through to the date format
    if sincesecs < 2:
        return 'right now'
    if sincesecs < minute:
        return '%s seconds ago' % sincesecs
    if sincesecs < minute * 2:
        return 'a minute ago'
    if sincesecs < hour:
        # floor division keeps the count a whole number on any Python version
        return '%s minutes ago' % (sincesecs // minute)
    if sincesecs < hour * 2:
        return 'an hour ago'
    if sincesecs < day:
        return '%s hours ago' % (sincesecs // hour)
    if sincesecs < day * 2:
        return 'a day ago'
    return '%s days ago' % (sincesecs // day)

def is_current(datestr, minimum_age):
    """Tell whether datestr lies within the last minimum_age minutes.

    Returns True when minimum_age is not positive (filtering disabled) or
    when convert_pubdate() yields no date for datestr.
    """
    published = convert_pubdate(datestr)
    if not (minimum_age > 0 and published):
        return True
    elapsed = datetime.now() - published
    elapsed_minutes = elapsed.days * 1440 + elapsed.seconds // 60
    return elapsed_minutes <= minimum_age

def get_ext(txt):
    """Map a media/type description to a file extension.

    The first of 'image', 'video', 'flash' found in txt selects '.jpg',
    '.mp4' or '.swf' respectively. When none is found, txt itself is
    returned unchanged.
    """
    known_kinds = (
        ('image', '.jpg'),
        ('video', '.mp4'),
        ('flash', '.swf'),
    )
    for keyword, extension in known_kinds:
        if keyword in txt:
            return extension
    return txt

def hashme(txt):
    """Return the hexadecimal MD5 digest of txt encoded as UTF-8."""
    return hashlib.md5(txt.encode('utf-8')).hexdigest()

def create_qr(txt):
    """Create a QR code image for txt and return the image path.

    The image is stored under tempf/feedname and named after the hash of
    txt, so an existing file is reused instead of being regenerated. Only
    the first 60 characters of txt are encoded. Falsy input yields ''.
    """
    if not txt:
        return ''
    target = os.path.join(tempf, feedname, hashme(txt) + '.jpg')
    if os.path.exists(target):
        return target
    code = QRCode(4, QRErrorCorrectLevel.L)
    code.addData(txt[:60])
    code.make()
    code.makeImage().save(target)
    return target

# Matches a double-quoted src attribute whose URL (without a query string)
# ends in a known image extension; group(1) is the complete URL.
_IMG_SRC_RE = re.compile(r'src\s*=\s*"([^"?]*\.(?:jpg|jpeg|png|gif))"', re.M | re.I)


def _find_embedded_image(item):
    # return the first image URL found in the item's description or encoded
    # markup, or '' when neither contains one
    for field in ('description', 'encoded'):
        try:
            found = _IMG_SRC_RE.search(getattr(item, field))
        except Exception:
            # field missing or not text: treat as "no image here" (best effort)
            found = None
        if found:
            return found.group(1)
    return ''


def read_data(feedname, feedurl):
    """Download a feed, collect its items and return how many are loaded.

    feedname -- name of the feed folder under the temp directory
    feedurl  -- URL the feed is downloaded from

    For every item that passes the is_current() age filter, the image URL,
    hashed image file name, title, publication date and cleaned description
    are appended to the module-level item* lists. Files in the feed folder
    that were not matched to a current item are deleted. Returns
    len(itemtitle) after a successful parse, otherwise 0.

    Relies on the module globals log and scala5 being set by the caller
    (the __main__ block sets both).
    """
    global feedrefreshtime, itemtimerange, totalmax, username, userpassword
    feedtotal = 0
    savefile = ''
    # create feed folder (if necessary)
    tempdir = os.path.join(tempf, feedname)
    if not os.path.exists(tempdir):
        os.makedirs(tempdir)
    # download feed; a failure is ignored on purpose, whatever copy
    # find_file locates below is used
    try:
        st.grab_url(feedurl, filename=filename, tmpfolder=feedname, timeout=5,
                    username=username, password=userpassword, minutes=feedrefreshtime)
    except Exception:
        pass
    # list cached files from feed folder
    cachedfiles = os.listdir(tempdir)
    # exclude feed xml
    try:
        cachedfiles.remove(filename)
    except ValueError:
        pass  # feed file not in the folder
    # check if feed is present
    try:
        savefile = st.find_file(filename, tmpfolder=feedname)
    except Exception:
        pass
    # open feed
    if savefile:
        try:
            # look for <item> inside <channel>, then try from the root
            channel = st.DataChain(savefile, roottag='channel')
            if len(channel.items) == 0:
                channel = st.DataChain(savefile, roottag='')
            if scala5: svars.channel_title = channel.title
            # search for items with file urls
            for i, item in enumerate(channel.items):
                if (i >= totalmax) and (totalmax > 0): break
                if not is_current(getattr(item, 'pubDate', None), itemtimerange):
                    continue
                tmpitemimgurl = ''
                tmpitemext = ''
                # <image>
                if getattr(item, 'image', None):
                    tmpitemimgurl = item.image.url
                    tmpitemext = os.path.splitext(tmpitemimgurl)[1]
                # <enclosure>
                elif getattr(item, 'enclosure', None):
                    tmpitemimgurl = item.enclosure._url
                    if getattr(item.enclosure, '_type', None):
                        tmpitemext = get_ext(item.enclosure._type)
                    else:
                        tmpitemext = os.path.splitext(tmpitemimgurl)[1]
                # <media:content>
                elif getattr(item, 'content', None):
                    tmpitemimgurl = item.content._url
                    if getattr(item.content, '_medium', None):
                        tmpitemext = get_ext(item.content._medium)
                    elif getattr(item.content, '_type', None):
                        tmpitemext = get_ext(item.content._type)
                    else:
                        tmpitemext = os.path.splitext(tmpitemimgurl)[1]
                # @src = inside <description> or <content:encoded>
                if not tmpitemimgurl:
                    tmpitemimgurl = _find_embedded_image(item)
                    if tmpitemimgurl:
                        tmpitemext = os.path.splitext(tmpitemimgurl)[1]
                # add item image to list
                if tmpitemimgurl:
                    itemimgurl.append(tmpitemimgurl)
                    itemimgfile.append(hashme(tmpitemimgurl)+tmpitemext)
                    try:
                        # the file belongs to a current item: keep it cached
                        cachedfiles.remove(itemimgfile[-1])
                    except ValueError:
                        pass  # not downloaded yet
                else:
                    itemimgurl.append('')
                    itemimgfile.append('')
                itemtitle.append(item.title)
                itempubdate.append(item.pubDate)
                itemdesc.append(cleanup_description(item.description))

                # add item link to list
                # NOTE(review): the raw link is stored here, not a QR image
                # path, and create_qr() is not called - confirm the intent.
                # Items without a link add nothing, so itemqrfile can end up
                # shorter than the other item lists.
                if item.link:
                    itemqrfile.append(item.link)
                    try:
                        cachedfiles.remove(os.path.basename(itemqrfile[-1]))
                    except ValueError:
                        pass
            # remove unused cached files
            for stale in cachedfiles:
                os.unlink(os.path.join(tempdir, stale))
            feedtotal = len(itemtitle)
        except Exception as e:
            log.error(e) # couldn't read file
    else:
        # do not excessively harass server if file not found, write empty file and wait for next interval
        placeholder = os.path.join(tempdir, filename)
        if not os.path.exists(placeholder):
            with open(placeholder, 'w'):
                pass
    return feedtotal

if __name__ == '__main__':
    # Command-line smoke test: set up the globals read_data() expects
    # (log, scala5), load one sample feed and report how many items it held.
    scala5 = None
    log = sl.get_logger(con=1, level='info')
    feedname, feedurl = 'scalanews', 'http://feeds.feedburner.com/ScalaDigitalSignage'
    total_items = read_data(feedname, feedurl)
    print('It works *** Total Items: %s' % total_items)

我是 Python 的初学者,所以大多数时候只是尝试照搬已有的、能正常工作的写法。因此,我尝试了 itemauthor.append(item.author),因为其他标签都是这种格式。我也尝试过使用 getattr。这两种写法在测试时都报了缩进错误。

0 个答案:

没有答案