I came across the following code, called newsfeatures.py, in the book "Programming Collective Intelligence".
Here is the code:
import feedparser
import re
feedlist=['http://today.reuters.com/rss/topNews',
          'http://today.reuters.com/rss/domesticNews',
          'http://today.reuters.com/rss/worldNews',
          'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
          'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
          'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
          'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
          'http://news.google.com/?output=rss',
          'http://feeds.salon.com/salon/news',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
          'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
          'http://rss.cnn.com/rss/edition.rss',
          'http://rss.cnn.com/rss/edition_world.rss',
          'http://rss.cnn.com/rss/edition_us.rss']
# Remove HTML tags from a string, keeping only the text outside the tags
def stripHTML(h):
    p=''
    s=0          # s==1 while we are inside a tag
    for c in h:
        if c=='<': s=1
        elif c=='>':
            s=0
            p+=' '
        elif s==0: p+=c
    return p
# Split text on non-word characters and keep lowercased words longer than 3 letters
def separatewords(text):
    splitter=re.compile('\\W*')
    return [s.lower() for s in splitter.split(text) if len(s)>3]
def getarticlewords():
    allwords={}
    articlewords=[]
    articletitles=[]
    ec=0
    # Loop over every feed
    for feed in feedlist:
        f=feedparser.parse(feed)
        # Loop over every article
        for e in f.entries:
            # Ignore identical articles
            if e.title in articletitles: continue
            # Extract the words
            txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8'))
            words=separatewords(txt)
            articlewords.append({})
            articletitles.append(e.title)
            # Increase the counts for this word in allwords and in articlewords
            for word in words:
                allwords.setdefault(word,0)
                allwords[word]+=1
                articlewords[ec].setdefault(word,0)
                articlewords[ec][word]+=1
            ec+=1
    return allwords,articlewords,articletitles
def makematrix(allw,articlew):
    wordvec=[]
    # Only take words that are common but not too common
    for w,c in allw.items():
        if c>3 and c<len(articlew)*0.6:
            wordvec.append(w)
    # Create the word matrix
    l1=[[(word in f and f[word] or 0) for word in wordvec] for f in articlew]
    return l1,wordvec
from numpy import *

def showfeatures(w,h,titles,wordvec,out='features.txt'):
    outfile=file(out,'w')
    pc,wc=shape(h)
    toppatterns=[[] for i in range(len(titles))]
    patternnames=[]
    # Loop over all the features
    for i in range(pc):
        slist=[]
        # Create a list of words and their weights
        for j in range(wc):
            slist.append((h[i,j],wordvec[j]))
        # Reverse sort the word list
        slist.sort()
        slist.reverse()
        # Print the first six elements
        n=[s[1] for s in slist[0:6]]
        outfile.write(str(n)+'\n')
        patternnames.append(n)
        # Create a list of articles for this feature
        flist=[]
        for j in range(len(titles)):
            # Add the article with its weight
            flist.append((w[j,i],titles[j]))
            toppatterns[j].append((w[j,i],i,titles[j]))
        # Reverse sort the list
        flist.sort()
        flist.reverse()
        # Show the top 3 articles
        for f in flist[0:3]:
            outfile.write(str(f)+'\n')
        outfile.write('\n')
    outfile.close()
    # Return the pattern names for later use
    return toppatterns,patternnames
It is used like this:
>>> import newsfeatures
>>> allw,artw,artt=newsfeatures.getarticlewords()
>>> artt[1]
u'Fatah, Hamas men abducted freed: sources'
As you can see, this line produces a news headline.
What I would like to know is whether the program can display not only the title but also which feed in feedlist the title came from.
Can anyone help?
Thanks!
Answer 0 (score: 1)
Replace

articletitles.append(e.title)

in getarticlewords() with something like

articletitles.append(' '.join([e.title, ', from', feed]))
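
If you would rather keep the titles themselves unchanged, another option is to return the source of each article in a separate list. The sketch below is only my own variant of that idea, not code from the book or from the answer above: the name getarticlewords_with_sources and the articlesources list are invented for illustration, and the function is assumed to live in newsfeatures.py next to the existing helpers (feedlist, stripHTML, separatewords).

def getarticlewords_with_sources():
    # Same logic as getarticlewords(), plus a parallel list recording
    # which feed each kept article came from (hypothetical addition).
    allwords={}
    articlewords=[]
    articletitles=[]
    articlesources=[]   # feed URL for each article, index-aligned with articletitles
    ec=0
    for feed in feedlist:
        f=feedparser.parse(feed)
        for e in f.entries:
            # Ignore identical articles
            if e.title in articletitles: continue
            # Extract the words from the title and the stripped description
            txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8'))
            words=separatewords(txt)
            articlewords.append({})
            articletitles.append(e.title)
            articlesources.append(feed)   # remember the feed this entry came from
            # Increase the counts for this word in allwords and in articlewords
            for word in words:
                allwords.setdefault(word,0)
                allwords[word]+=1
                articlewords[ec].setdefault(word,0)
                articlewords[ec][word]+=1
            ec+=1
    return allwords,articlewords,articletitles,articlesources

It would then be called with one extra return value:

>>> allw,artw,artt,artsrc=newsfeatures.getarticlewords_with_sources()
>>> artt[1], artsrc[1]   # the title together with the feed URL it came from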