I am trying to read additional elements from an RSS feed in XML format. Using an existing Python script, I can capture a series of variables from the feed's element tags. My research suggests that valid elements include the author tag and the comments tag. However, when I try to edit the script to include these tags, it fails: every time I run the edited version from the command line, I get an IndentationError.
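For reference, this is the kind of standalone check I can run against the downloaded feed.xml to see whether those tags are present (a minimal sketch assuming a standard RSS 2.0 layout; the script below uses scalatools' DataChain rather than ElementTree):

import xml.etree.ElementTree as ET

tree = ET.parse('feed.xml')  # the same feed file the script below downloads
for item in tree.getroot().iter('item'):
    # <author> and <comments> are optional per-item elements in RSS 2.0
    print item.findtext('author', default=''), item.findtext('comments', default='')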
Here is the working code:
import tempfile, os, hashlib, re, sys
from datetime import datetime
import email.utils
import scalatools as st
import scalalib as sl
from HTMLParser import HTMLParser
parser = HTMLParser()
tempf = tempfile.gettempdir()
svars = sl.sharedvars()
filename = 'feed.xml'
feedname = ''
lengthmax = 480 # character limit
totalmax = 0 # limit to n items (0 = max)
feedrefreshtime = 10 # minutes to check feed for updates
itemtimerange = 0 # minutes to filter old items (0 = don't filter)
username = '' # HTTP authentication user (if required)
userpassword = '' # HTTP authentication password (if required)
itemimgurl = []
itemimgfile = []
itemtitle = []
itempubdate = []
itemdesc = []
itemqrfile = []
itemauthor = []
cachedfiles = []
try: # 3rd party module
    from PyQRNative import *
except ImportError:
    from scalatools import pip # attempt dynamic install
    pip('install', 'PyQRNative')
    from PyQRNative import *
def cleanup_text(txt):
    # remove unnecessary clutter from text
    if txt:
        tmp = txt.strip() # strip leading/trailing spaces
        tmp = re.sub('<[^>]+>', '', tmp) # remove <html> first pass
        tmp = parser.unescape(tmp) # remove <html> 2nd pass
        tmp = tmp.replace("\r"," ") # remove CR
        tmp = tmp.replace("\n"," ") # remove LF
        tmp = tmp.replace("\t"," ") # remove tabs
        tmp = re.sub(' +', ' ', tmp) # remove extra spaces
        return tmp
    else:
        return ''
def cleanup_description(txt):
    # fix and shorten description text
    global lengthmax
    if txt:
        tmp = cleanup_text(txt)
        tmp = smart_truncate(tmp, lengthmax)
        return tmp
def cleanup_filename(txt):
    # make sure file name is valid
    if txt:
        tmp = cleanup_text(txt)
        tmp = tmp.replace(' ','_') # replace spaces with underscores
        tmp = re.sub('[\\\/:*?. \x22 <>|=]', '', tmp) # remove non Windows-friendly chars
        return tmp
    else:
        return ''
def smart_truncate(txt, length=160, suffix='...'):
    # truncate a string at the nearest word boundary
    if txt:
        if len(txt) <= length:
            return txt
        else:
            return ' '.join(txt[:length+1-len(suffix)].split(' ')[0:-1]) + suffix
    else:
        return ''
def convert_pubdate(datestr):
    # convert RFC-822 date string to datetime value
    if datestr:
        parsedt = email.utils.parsedate_tz(datestr) # convert to tuple
        dateval = datetime.fromtimestamp(email.utils.mktime_tz(parsedt)) # convert to datetime
        return dateval
    else:
        return None
def time_ago(datestr):
    # convert date to user-friendly text
    minute = 60
    hour = minute * 60
    day = hour * 24
    week = day * 7
    dateval = convert_pubdate(datestr)
    if dateval:
        sincedate = datetime.now() - dateval
        sincesecs = sincedate.days * day + sincedate.seconds
        if 0 < sincesecs < 2:
            return 'right now'
        elif 0 < sincesecs < minute:
            return '%s seconds ago' % sincesecs
        elif 0 < sincesecs < minute * 2:
            return 'a minute ago'
        elif 0 < sincesecs < hour:
            return '%s minutes ago' % (sincesecs / minute)
        elif 0 < sincesecs < hour * 2:
            return 'an hour ago'
        elif 0 < sincesecs < day:
            return '%s hours ago' % (sincesecs / hour)
        elif 0 < sincesecs < day * 2:
            return 'a day ago'
        elif 0 < sincesecs < week:
            return '%s days ago' % (sincesecs / day)
        else:
            return dateval.strftime('%x')
    else:
        return ''
def is_current(datestr, minimum_age):
    # check if date is within minimum_age (mins)
    iscurrent = True
    dateval = convert_pubdate(datestr)
    if minimum_age > 0 and dateval:
        sincedate = datetime.now() - dateval
        sincemins = sincedate.days * 1440 + sincedate.seconds // 60
        iscurrent = sincemins <= minimum_age
    return iscurrent
def get_ext(txt):
    # choose correct file extension
    tmp = txt
    if 'image' in txt: tmp = '.jpg'
    elif 'video' in txt: tmp = '.mp4'
    elif 'flash' in txt: tmp = '.swf'
    return tmp
def hashme(txt):
    # generate a unique hash string
    digest = hashlib.md5()
    digest.update(txt.encode('utf-8'))
    return digest.hexdigest()
def create_qr(txt):
    # create qr image from text string
    img_file = ''
    global feedname
    if txt:
        img_file = os.path.join(tempf, feedname, hashme(txt) + '.jpg')
        if not os.path.exists(img_file):
            qr = QRCode(4, QRErrorCorrectLevel.L)
            qr.addData(txt[:60])
            qr.make()
            im = qr.makeImage()
            im.save(img_file)
    return img_file
def read_data(feedname, feedurl):
    # main function
    feedtotal = 0
    savefile = ''
    global feedrefreshtime, itemtimerange, totalmax, username, userpassword
    # create feed folder (if necessary)
    tempdir = os.path.join(tempf, feedname)
    if not os.path.exists(tempdir):
        os.makedirs(tempdir)
    # download feed
    try:
        st.grab_url(feedurl, filename=filename, tmpfolder=feedname, timeout=5,
                    username=username, password=userpassword, minutes=feedrefreshtime)
    except:
        pass
    # list cached files from feed folder
    cachedfiles = os.listdir(tempdir)
    # exclude feed xml
    try:
        cachedfiles.remove(filename)
    except:
        pass
    # check if feed is present
    try:
        savefile = st.find_file(filename, tmpfolder=feedname)
    except:
        pass
    # open feed
    if savefile:
        try:
            # look for <item> inside <channel>, then try from the root
            channel = st.DataChain(savefile, roottag='channel')
            if len(channel.items) == 0:
                channel = st.DataChain(savefile, roottag='')
            if scala5: svars.channel_title = channel.title
            # search for items with file urls
            for i, item in enumerate(channel.items):
                tmpitemimgurl = ''
                tmpitemext = ''
                if (i >= totalmax) and (totalmax > 0): break
                if is_current(getattr(item, 'pubDate', None), itemtimerange):
                    # <image>
                    if getattr(item, 'image', None):
                        tmpitemimgurl = item.image.url
                        tmpitemext = os.path.splitext(tmpitemimgurl)[1]
                    # <enclosure>
                    elif getattr(item, 'enclosure', None):
                        tmpitemimgurl = item.enclosure._url
                        if getattr(item.enclosure, '_type', None):
                            tmpitemext = get_ext(item.enclosure._type)
                        else:
                            tmpitemext = os.path.splitext(tmpitemimgurl)[1]
                    # <category>
                    # <endDate>
                    # <media:content>
                    elif getattr(item, 'content', None):
                        tmpitemimgurl = item.content._url
                        if getattr(item.content, '_medium', None):
                            tmpitemext = get_ext(item.content._medium)
                        elif getattr(item.content, '_type', None):
                            tmpitemext = get_ext(item.content._type)
                        else:
                            tmpitemext = os.path.splitext(tmpitemimgurl)[1]
                    # @src = inside <description> or <content:encoded>
                    if not tmpitemimgurl:
                        matchObj = None
                        try:
                            matchObj = re.search(r'src\s*=\s*"([^"\?]*\.(?:jpg|jpeg|png|gif))"', item.description, re.M|re.I)
                        except:
                            pass
                        if not matchObj:
                            try:
                                matchObj = re.search(r'src\s*=\s*"([^"\?]*\.(?:jpg|jpeg|png|gif))"', item.encoded, re.M|re.I)
                            except:
                                pass
                        if matchObj:
                            tmpitemimgurl = matchObj.group(1)
                            tmpitemext = os.path.splitext(tmpitemimgurl)[1]
                    # add item image to list
                    if tmpitemimgurl:
                        itemimgurl.append(tmpitemimgurl)
                        itemimgfile.append(hashme(tmpitemimgurl)+tmpitemext)
                        try:
                            cachedfiles.remove(itemimgfile[-1])
                        except:
                            pass
                    else:
                        itemimgurl.append('')
                        itemimgfile.append('')
                    itemtitle.append(item.title)
                    itempubdate.append(item.pubDate)
                    itemdesc.append(cleanup_description(item.description))
                    # add item link to list as qrcode image
                    if item.link:
                        itemqrfile.append(item.link)
                        try:
                            cachedfiles.remove(os.path.basename(itemqrfile[-1]))
                        except:
                            pass
            # remove unused cached files
            for file in cachedfiles:
                os.unlink(os.path.join(tempdir, file))
            feedtotal = len(itemtitle)
        except Exception, e:
            log.error(e) # couldn't read file
            #st.msgbox(sys.exc_info()[-1].tb_lineno)
    else:
        # do not excessively harass server if file not found, write empty file and wait for next interval
        tempfile = os.path.join(tempdir, filename)
        if not os.path.exists(tempfile):
            with open(tempfile, 'w') as f:
                pass
    return feedtotal
if __name__ == '__main__': # Run from the command line (for testing)
    log = sl.get_logger(con=1, level='info')
    scala5 = None
    feedname = 'scalanews'
    feedurl = 'http://feeds.feedburner.com/ScalaDigitalSignage'
    total_items = read_data(feedname, feedurl)
    print 'It works *** Total Items: %s' % total_items
I am a Python beginner, so for the most part I just try to copy what already works. With that in mind, I tried itemauthor.append(item.author), since that matches the pattern used for the other tags. I also tried getattr. Both attempts give me an IndentationError when I test from the command line.
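For reference, my attempted edit looked roughly like this, placed alongside the other per-item fields inside read_data() (this is a sketch of my change rather than the exact diff, and whether DataChain really exposes the element as item.author is exactly the part I am guessing at):

# inside read_data(), next to the existing appends (leading indentation omitted here)
itemtitle.append(item.title)
itempubdate.append(item.pubDate)
itemdesc.append(cleanup_description(item.description))
itemauthor.append(item.author)                    # first attempt, copying the pattern above
# itemauthor.append(getattr(item, 'author', ''))  # second attempt, using getattr with a default

Both versions fail with the same IndentationError as soon as the new line is added.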